diff --git a/CMakeLists.txt b/CMakeLists.txt
index 067d7d9435..e8067bda25 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,7 @@ option(MGARD_ENABLE_OPENMP "Enable OpenMP support." OFF)
 option(MGARD_ENABLE_CUDA "Enable CUDA support" OFF)
 option(MGARD_ENABLE_SERIAL "Enable SERIAL support" ON)
 option(MGARD_ENABLE_HIP "Enable HIP support" OFF)
+option(MGARD_ENABLE_SYCL "Enable SYCL support" OFF)
 option(MGARD_ENABLE_LEGACY_CUDA "Enable legacy CUDA support" OFF)
 option(MGARD_ENABLE_CLI "Build executable." OFF)
@@ -88,11 +89,18 @@ endif()
 if (MGARD_ENABLE_SERIAL OR
     MGARD_ENABLE_CUDA OR
-    MGARD_ENABLE_HIP )
+    MGARD_ENABLE_HIP OR
+    MGARD_ENABLE_SYCL)
+
+  if(MGARD_ENABLE_SYCL)
+    set(MGARD_ENABLE_SERIAL OFF)
+  endif()
   set (CMAKE_CXX_STANDARD 17)
   set (CMAKE_CXX_STANDARD_REQUIRED ON)
   set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w")
+
+  include(MgardXGenerateSource)
   add_subdirectory (src/mgard-x)
   add_subdirectory (include/mgard-x)
   add_subdirectory (include/mgard-x/MDR)
@@ -178,6 +186,12 @@ if (MGARD_ENABLE_HIP)
   set_source_files_properties(${MGARD_X_HIP_SRC} PROPERTIES LANGUAGE HIP)
 endif()
 
+if (MGARD_ENABLE_SYCL)
+  # No need to link with sycl libraries for now
+  # find_package(IntelDPCPP REQUIRED)
+  set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -w")
+endif()
+
 find_package(ZLIB REQUIRED)
 find_package(zstd)
@@ -271,6 +285,7 @@ target_sources(
   ${MGARD_X_SERIAL_SRC}
   ${MGARD_X_CUDA_SRC}
   ${MGARD_X_HIP_SRC}
+  ${MGARD_X_SYCL_SRC}
 )
 set_target_properties(mgard-library PROPERTIES CUDA_RESOLVE_DEVICE_SYMBOLS ON)
@@ -450,6 +465,7 @@ if (MGARD_ENABLE_HIP)
   message(STATUS "HIP Arch: ${CMAKE_HIP_ARCHITECTURES}")
   list(POP_BACK CMAKE_MESSAGE_INDENT)
 endif()
+message(STATUS "SYCL: ${MGARD_ENABLE_SYCL}")
 list(POP_BACK CMAKE_MESSAGE_INDENT)
 message(STATUS "LEGACY CUDA: ${MGARD_ENABLE_LEGACY_CUDA}")
 if (MGARD_ENABLE_LEGACY_CUDA)
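
Review note: with the new option threaded through to the configured header (see the `MGARDXConfig.h.in` hunk later in this patch), client code can test for the SYCL backend at compile time. A minimal sketch, assuming the configured `MGARDXConfig.h` is on the include path:

```cpp
// Minimal sketch: MGARD_ENABLE_SYCL is defined to 1 or 0 by the configured
// MGARDXConfig.h (see the MGARDXConfig.h.in change later in this patch).
#include "MGARDXConfig.h"
#include <iostream>

int main() {
#if MGARD_ENABLE_SYCL
  std::cout << "MGARD-X built with SYCL support\n";
#else
  std::cout << "MGARD-X built without SYCL support\n";
#endif
  return 0;
}
```

Note that turning SYCL on forces `MGARD_ENABLE_SERIAL` off above, so the two backends are mutually exclusive within a single build.
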
-d "${zstd_src_dir}" ]; then + git clone -b v1.5.0 https://github.com/facebook/zstd.git ${zstd_src_dir} +fi +mkdir -p ${zstd_build_dir} +cmake -S ${zstd_src_dir}/build/cmake -B ${zstd_build_dir}\ + -DZSTD_MULTITHREAD_SUPPORT=ON\ + -DCMAKE_INSTALL_LIBDIR=lib\ + -DCMAKE_INSTALL_PREFIX=${zstd_install_dir} +cmake --build ${zstd_build_dir} -j ${num_build_procs} +cmake --install ${zstd_build_dir} + +#build Protobuf +protobuf_dir=${build_dir}/protobuf +protobuf_src_dir=${protobuf_dir}/src +protobuf_build_dir=${protobuf_dir}/build +protobuf_install_dir=${install_dir} +if [ ! -d "${protobuf_src_dir}" ]; then + git clone -b v3.19.4 --recurse-submodules https://github.com/protocolbuffers/protobuf.git ${protobuf_src_dir} +fi +mkdir -p ${protobuf_build_dir} +cmake -S ${protobuf_src_dir}/cmake -B ${protobuf_build_dir}\ + -Dprotobuf_BUILD_SHARED_LIBS=ON\ + -DCMAKE_INSTALL_PREFIX=${protobuf_install_dir} +cmake --build ${protobuf_build_dir} -j ${num_build_procs} +cmake --install ${protobuf_build_dir} + + +#build MGARD +mgard_x_build_dir=${build_dir}/mgard +mgard_x_install_dir=${install_dir} +mkdir -p ${mgard_x_build_dir} +cmake -S ${mgard_x_src_dir} -B ${mgard_x_build_dir} \ + -DCMAKE_PREFIX_PATH="${nvcomp_install_dir};${zstd_install_dir}/lib/cmake/zstd;${protobuf_install_dir}"\ + -DMGARD_ENABLE_SERIAL=OFF\ + -DMGARD_ENABLE_LEGACY_CUDA=ON\ + -DCMAKE_CUDA_ARCHITECTURES="75"\ + -DMGARD_ENABLE_DOCS=OFF\ + -DCMAKE_BUILD_TYPE=Release\ + -DCMAKE_INSTALL_PREFIX=${mgard_x_install_dir} +cmake --build ${mgard_x_build_dir} -j ${num_build_procs} +cmake --install ${mgard_x_build_dir} diff --git a/build_scripts/build_mgard_cuda_turing.sh b/build_scripts/build_mgard_cuda_turing.sh index 181398a53e..788b1d483b 100755 --- a/build_scripts/build_mgard_cuda_turing.sh +++ b/build_scripts/build_mgard_cuda_turing.sh @@ -15,7 +15,7 @@ mgard_x_src_dir=. # Build directory build_dir=./build-cuda-turing # Number of processors used for building -num_build_procs=8 +num_build_procs=16 # Installtaion directory install_dir=./install-cuda-turing diff --git a/build_scripts/build_mgard_sycl_gen9.sh b/build_scripts/build_mgard_sycl_gen9.sh new file mode 100755 index 0000000000..00d0620859 --- /dev/null +++ b/build_scripts/build_mgard_sycl_gen9.sh @@ -0,0 +1,74 @@ +#!/bin/sh + +# Copyright 2021, Oak Ridge National Laboratory. +# MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs +# Author: Jieyang Chen (chenj3@ornl.gov) +# Date: April 2, 2021 +# Script for building MGARD-X + +set -e +set -x + +######## User Configurations ######## +# Source directory +mgard_x_src_dir=. +# Build directory +build_dir=./build-sycl-gen9 +# Number of processors used for building +num_build_procs=8 +# Installtaion directory +install_dir=./install-sycl-gen9 + + +#build ZSTD +zstd_dir=${build_dir}/zstd +zstd_src_dir=${zstd_dir}/src +zstd_build_dir=${zstd_dir}/build +zstd_install_dir=${install_dir} +if [ ! -d "${zstd_src_dir}" ]; then + git clone -b v1.5.0 https://github.com/facebook/zstd.git ${zstd_src_dir} +fi +mkdir -p ${zstd_build_dir} +cmake -S ${zstd_src_dir}/build/cmake -B ${zstd_build_dir}\ + -DZSTD_MULTITHREAD_SUPPORT=ON\ + -DCMAKE_INSTALL_LIBDIR=lib\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${zstd_install_dir} +cmake --build ${zstd_build_dir} -j ${num_build_procs} +cmake --install ${zstd_build_dir} + +#build Protobuf +protobuf_dir=${build_dir}/protobuf +protobuf_src_dir=${protobuf_dir}/src +protobuf_build_dir=${protobuf_dir}/build +protobuf_install_dir=${install_dir} +if [ ! 
-d "${protobuf_src_dir}" ]; then + git clone -b v3.19.4 --recurse-submodules https://github.com/protocolbuffers/protobuf.git ${protobuf_src_dir} +fi +mkdir -p ${protobuf_build_dir} +cmake -S ${protobuf_src_dir}/cmake -B ${protobuf_build_dir}\ + -Dprotobuf_BUILD_SHARED_LIBS=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${protobuf_install_dir} +cmake --build ${protobuf_build_dir} -j ${num_build_procs} +cmake --install ${protobuf_build_dir} + + +#build MGARD +mgard_x_build_dir=${build_dir}/mgard +mgard_x_install_dir=${install_dir} +mkdir -p ${mgard_x_build_dir} +cmake -S ${mgard_x_src_dir} -B ${mgard_x_build_dir} \ + -DCMAKE_PREFIX_PATH="${zstd_install_dir}/lib/cmake/zstd;${protobuf_install_dir}"\ + -DMGARD_ENABLE_SERIAL=OFF\ + -DMGARD_ENABLE_SYCL=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_CXX_FLAGS="-O2 -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend \"-device gen9\""\ + -DMGARD_ENABLE_DOCS=OFF\ + -DCMAKE_BUILD_TYPE=Release\ + -DCMAKE_INSTALL_PREFIX=${mgard_x_install_dir} +cmake --build ${mgard_x_build_dir} -j ${num_build_procs} +cmake --install ${mgard_x_build_dir} diff --git a/build_scripts/build_mgard_sycl_x86.sh b/build_scripts/build_mgard_sycl_x86.sh new file mode 100755 index 0000000000..4076053883 --- /dev/null +++ b/build_scripts/build_mgard_sycl_x86.sh @@ -0,0 +1,74 @@ +#!/bin/sh + +# Copyright 2021, Oak Ridge National Laboratory. +# MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs +# Author: Jieyang Chen (chenj3@ornl.gov) +# Date: April 2, 2021 +# Script for building MGARD-X + +set -e +set -x + +######## User Configurations ######## +# Source directory +mgard_x_src_dir=. +# Build directory +build_dir=./build-sycl-x86 +# Number of processors used for building +num_build_procs=8 +# Installtaion directory +install_dir=./install-sycl-x86 + + +#build ZSTD +zstd_dir=${build_dir}/zstd +zstd_src_dir=${zstd_dir}/src +zstd_build_dir=${zstd_dir}/build +zstd_install_dir=${install_dir} +if [ ! -d "${zstd_src_dir}" ]; then + git clone -b v1.5.0 https://github.com/facebook/zstd.git ${zstd_src_dir} +fi +mkdir -p ${zstd_build_dir} +cmake -S ${zstd_src_dir}/build/cmake -B ${zstd_build_dir}\ + -DZSTD_MULTITHREAD_SUPPORT=ON\ + -DCMAKE_INSTALL_LIBDIR=lib\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${zstd_install_dir} +cmake --build ${zstd_build_dir} -j ${num_build_procs} +cmake --install ${zstd_build_dir} + +#build Protobuf +protobuf_dir=${build_dir}/protobuf +protobuf_src_dir=${protobuf_dir}/src +protobuf_build_dir=${protobuf_dir}/build +protobuf_install_dir=${install_dir} +if [ ! 
-d "${protobuf_src_dir}" ]; then + git clone -b v3.19.4 --recurse-submodules https://github.com/protocolbuffers/protobuf.git ${protobuf_src_dir} +fi +mkdir -p ${protobuf_build_dir} +cmake -S ${protobuf_src_dir}/cmake -B ${protobuf_build_dir}\ + -Dprotobuf_BUILD_SHARED_LIBS=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${protobuf_install_dir} +cmake --build ${protobuf_build_dir} -j ${num_build_procs} +cmake --install ${protobuf_build_dir} + + +#build MGARD +mgard_x_build_dir=${build_dir}/mgard +mgard_x_install_dir=${install_dir} +mkdir -p ${mgard_x_build_dir} +cmake -S ${mgard_x_src_dir} -B ${mgard_x_build_dir} \ + -DCMAKE_PREFIX_PATH="${zstd_install_dir}/lib/cmake/zstd;${protobuf_install_dir}"\ + -DMGARD_ENABLE_SERIAL=OFF\ + -DMGARD_ENABLE_SYCL=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_CXX_FLAGS="-fsycl -fsycl-targets=x86_64"\ + -DMGARD_ENABLE_DOCS=OFF\ + -DCMAKE_BUILD_TYPE=Release\ + -DCMAKE_INSTALL_PREFIX=${mgard_x_install_dir} +cmake --build ${mgard_x_build_dir} -j ${num_build_procs} +cmake --install ${mgard_x_build_dir} diff --git a/build_scripts/build_mgard_sycl_xehp.sh b/build_scripts/build_mgard_sycl_xehp.sh new file mode 100755 index 0000000000..5e23c299ed --- /dev/null +++ b/build_scripts/build_mgard_sycl_xehp.sh @@ -0,0 +1,74 @@ +#!/bin/sh + +# Copyright 2021, Oak Ridge National Laboratory. +# MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs +# Author: Jieyang Chen (chenj3@ornl.gov) +# Date: April 2, 2021 +# Script for building MGARD-X + +set -e +set -x + +######## User Configurations ######## +# Source directory +mgard_x_src_dir=. +# Build directory +build_dir=./build-sycl-xehp +# Number of processors used for building +num_build_procs=8 +# Installtaion directory +install_dir=./install-sycl-xehp + + +#build ZSTD +zstd_dir=${build_dir}/zstd +zstd_src_dir=${zstd_dir}/src +zstd_build_dir=${zstd_dir}/build +zstd_install_dir=${install_dir} +if [ ! -d "${zstd_src_dir}" ]; then + git clone -b v1.5.0 https://github.com/facebook/zstd.git ${zstd_src_dir} +fi +mkdir -p ${zstd_build_dir} +cmake -S ${zstd_src_dir}/build/cmake -B ${zstd_build_dir}\ + -DZSTD_MULTITHREAD_SUPPORT=ON\ + -DCMAKE_INSTALL_LIBDIR=lib\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${zstd_install_dir} +cmake --build ${zstd_build_dir} -j ${num_build_procs} +cmake --install ${zstd_build_dir} + +#build Protobuf +protobuf_dir=${build_dir}/protobuf +protobuf_src_dir=${protobuf_dir}/src +protobuf_build_dir=${protobuf_dir}/build +protobuf_install_dir=${install_dir} +if [ ! 
-d "${protobuf_src_dir}" ]; then + git clone -b v3.19.4 --recurse-submodules https://github.com/protocolbuffers/protobuf.git ${protobuf_src_dir} +fi +mkdir -p ${protobuf_build_dir} +cmake -S ${protobuf_src_dir}/cmake -B ${protobuf_build_dir}\ + -Dprotobuf_BUILD_SHARED_LIBS=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_INSTALL_PREFIX=${protobuf_install_dir} +cmake --build ${protobuf_build_dir} -j ${num_build_procs} +cmake --install ${protobuf_build_dir} + + +#build MGARD +mgard_x_build_dir=${build_dir}/mgard +mgard_x_install_dir=${install_dir} +mkdir -p ${mgard_x_build_dir} +cmake -S ${mgard_x_src_dir} -B ${mgard_x_build_dir} \ + -DCMAKE_PREFIX_PATH="${zstd_install_dir}/lib/cmake/zstd;${protobuf_install_dir}"\ + -DMGARD_ENABLE_SERIAL=OFF\ + -DMGARD_ENABLE_SYCL=ON\ + -DCMAKE_CXX_COMPILER=icpx\ + -DCMAKE_C_COMPILER=icx\ + -DCMAKE_CXX_FLAGS="-O2 -fsycl -fsycl-targets=spir64_gen -Xsycl-target-backend \"-device xehp\""\ + -DMGARD_ENABLE_DOCS=OFF\ + -DCMAKE_BUILD_TYPE=Release\ + -DCMAKE_INSTALL_PREFIX=${mgard_x_install_dir} +cmake --build ${mgard_x_build_dir} -j ${num_build_procs} +cmake --install ${mgard_x_build_dir} diff --git a/cmake/MgardXGenerateSource.cmake b/cmake/MgardXGenerateSource.cmake new file mode 100644 index 0000000000..941d152102 --- /dev/null +++ b/cmake/MgardXGenerateSource.cmake @@ -0,0 +1,76 @@ +if (MGARD_ENABLE_SERIAL) + list(APPEND DEVICE_TYPE_LIST SERIAL) +endif() +if (MGARD_ENABLE_CUDA) + list(APPEND DEVICE_TYPE_LIST CUDA) +endif() +if (MGARD_ENABLE_HIP) + list(APPEND DEVICE_TYPE_LIST HIP) +endif() +if (MGARD_ENABLE_SYCL) + list(APPEND DEVICE_TYPE_LIST SYCL) +endif() + +set(DATA_TYPE_LIST double float) +set(NUM_DIM_LIST 1 2 3 4 5) + +function(MgardXGenerateSourceAllCombinations src_file_prefix) + foreach(DEVICE_TYPE IN LISTS DEVICE_TYPE_LIST) + foreach(DATA_TYPE IN LISTS DATA_TYPE_LIST) + foreach(NUM_DIM IN LISTS NUM_DIM_LIST) + set(SRC_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${src_file_prefix}.cpp.in") + set(GEN_FILE "${CMAKE_CURRENT_BINARY_DIR}/${src_file_prefix}_${NUM_DIM}D_${DATA_TYPE}_${DEVICE_TYPE}.cpp") + configure_file(${SRC_FILE} ${GEN_FILE}) + if (${DEVICE_TYPE} STREQUAL "SERIAL") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CXX) + list(APPEND MGARD_X_SERIAL_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL "CUDA") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CUDA) + list(APPEND MGARD_X_CUDA_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL "HIP") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE HIP) + list(APPEND MGARD_X_HIP_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL "SYCL") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CXX) + list(APPEND MGARD_X_SYCL_SRC ${GEN_FILE}) + endif() + + endforeach() + endforeach() + endforeach() + set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) + set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) + set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) + set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) +endfunction() + +function(MgardXGenerateSourceAllDevices src_file_prefix) + foreach(DEVICE_TYPE IN LISTS DEVICE_TYPE_LIST) + set(SRC_FILE "${CMAKE_CURRENT_SOURCE_DIR}/${src_file_prefix}.cpp.in") + set(GEN_FILE "${CMAKE_CURRENT_BINARY_DIR}/${src_file_prefix}_${DEVICE_TYPE}.cpp") + configure_file(${SRC_FILE} ${GEN_FILE}) + if (${DEVICE_TYPE} STREQUAL "SERIAL") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CXX) + list(APPEND MGARD_X_SERIAL_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL 
"CUDA") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CUDA) + list(APPEND MGARD_X_CUDA_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL "HIP") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE HIP) + list(APPEND MGARD_X_HIP_SRC ${GEN_FILE}) + endif() + if (${DEVICE_TYPE} STREQUAL "SYCL") + set_source_files_properties(${GEN_FILE} PROPERTIES LANGUAGE CXX) + list(APPEND MGARD_X_SYCL_SRC ${GEN_FILE}) + endif() + endforeach() + set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) + set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) + set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) + set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) +endfunction() \ No newline at end of file diff --git a/examples/mgard-x/BatchTests/BatchTests.cpp b/examples/mgard-x/BatchTests/BatchTests.cpp index 705c051d80..8dfd7542fa 100644 --- a/examples/mgard-x/BatchTests/BatchTests.cpp +++ b/examples/mgard-x/BatchTests/BatchTests.cpp @@ -146,7 +146,7 @@ void compression(std::vector shape, enum device dev, T tol, T s, enum mgard_x::device_type dev_type; if (dev == X_Serial) { - dev_type = mgard_x::device_type::Serial; + dev_type = mgard_x::device_type::SERIAL; } else if (dev == X_CUDA) { dev_type = mgard_x::device_type::CUDA; } else if (dev == X_HIP) { @@ -211,7 +211,7 @@ void decompression(std::vector shape, enum device dev, T tol, enum mgard_x::device_type dev_type; if (dev == X_Serial) { - dev_type = mgard_x::device_type::Serial; + dev_type = mgard_x::device_type::SERIAL; } else if (dev == X_CUDA) { dev_type = mgard_x::device_type::CUDA; } else if (dev == X_HIP) { @@ -349,13 +349,13 @@ int main(int argc, char *argv[]) { dev2 = argv[i++]; enum device device_type1, device_type2; - enum mgard_x::device_type dev_type = mgard_x::device_type::None; + enum mgard_x::device_type dev_type = mgard_x::device_type::NONE; std::cout << "Device1: "; if (strcmp(dev1, "x-serial") == 0) { - dev_type = mgard_x::device_type::Serial; + dev_type = mgard_x::device_type::SERIAL; device_type1 = device::X_Serial; - std::cout << "MGARD-X::Serial\n"; + std::cout << "MGARD-X::SERIAL\n"; } else if (strcmp(dev1, "x-cuda") == 0) { dev_type = mgard_x::device_type::CUDA; device_type1 = device::X_CUDA; @@ -378,9 +378,9 @@ int main(int argc, char *argv[]) { std::cout << "Device2: "; if (strcmp(dev2, "x-serial") == 0) { - dev_type = mgard_x::device_type::Serial; + dev_type = mgard_x::device_type::SERIAL; device_type2 = device::X_Serial; - std::cout << "MGARD-X::Serial\n"; + std::cout << "MGARD-X::SERIAL\n"; } else if (strcmp(dev2, "x-cuda") == 0) { dev_type = mgard_x::device_type::CUDA; device_type2 = device::X_CUDA; diff --git a/examples/mgard-x/HighLevelAPIs/Example.cpp b/examples/mgard-x/HighLevelAPIs/Example.cpp index 1c608d93b4..94b5aeda3e 100644 --- a/examples/mgard-x/HighLevelAPIs/Example.cpp +++ b/examples/mgard-x/HighLevelAPIs/Example.cpp @@ -29,10 +29,10 @@ int main() { std::cout << "Done\n"; std::cout - << "Decompressing with MGARD-X High level API with Serial backend..."; + << "Decompressing with MGARD-X High level API with SERIAL backend..."; // decompression void *decompressed_array_cpu = NULL; - config.dev_type = mgard_x::device_type::Serial; + config.dev_type = mgard_x::device_type::SERIAL; mgard_x::decompress(compressed_array_cpu, compressed_size, decompressed_array_cpu, config, false); diff --git a/examples/mgard-x/LowLevelAPIs/README.md b/examples/mgard-x/LowLevelAPIs/README.md index 858ecced3c..2b8c445f0f 100644 --- a/examples/mgard-x/LowLevelAPIs/README.md +++ 
diff --git a/examples/mgard-x/BatchTests/BatchTests.cpp b/examples/mgard-x/BatchTests/BatchTests.cpp
index 705c051d80..8dfd7542fa 100644
--- a/examples/mgard-x/BatchTests/BatchTests.cpp
+++ b/examples/mgard-x/BatchTests/BatchTests.cpp
@@ -146,7 +146,7 @@ void compression(std::vector shape, enum device dev, T tol, T s,
   enum mgard_x::device_type dev_type;
   if (dev == X_Serial) {
-    dev_type = mgard_x::device_type::Serial;
+    dev_type = mgard_x::device_type::SERIAL;
   } else if (dev == X_CUDA) {
     dev_type = mgard_x::device_type::CUDA;
   } else if (dev == X_HIP) {
@@ -211,7 +211,7 @@ void decompression(std::vector shape, enum device dev, T tol,
   enum mgard_x::device_type dev_type;
   if (dev == X_Serial) {
-    dev_type = mgard_x::device_type::Serial;
+    dev_type = mgard_x::device_type::SERIAL;
   } else if (dev == X_CUDA) {
     dev_type = mgard_x::device_type::CUDA;
   } else if (dev == X_HIP) {
@@ -349,13 +349,13 @@ int main(int argc, char *argv[]) {
   dev2 = argv[i++];
 
   enum device device_type1, device_type2;
-  enum mgard_x::device_type dev_type = mgard_x::device_type::None;
+  enum mgard_x::device_type dev_type = mgard_x::device_type::NONE;
 
   std::cout << "Device1: ";
   if (strcmp(dev1, "x-serial") == 0) {
-    dev_type = mgard_x::device_type::Serial;
+    dev_type = mgard_x::device_type::SERIAL;
     device_type1 = device::X_Serial;
-    std::cout << "MGARD-X::Serial\n";
+    std::cout << "MGARD-X::SERIAL\n";
   } else if (strcmp(dev1, "x-cuda") == 0) {
     dev_type = mgard_x::device_type::CUDA;
     device_type1 = device::X_CUDA;
@@ -378,9 +378,9 @@ int main(int argc, char *argv[]) {
   std::cout << "Device2: ";
   if (strcmp(dev2, "x-serial") == 0) {
-    dev_type = mgard_x::device_type::Serial;
+    dev_type = mgard_x::device_type::SERIAL;
     device_type2 = device::X_Serial;
-    std::cout << "MGARD-X::Serial\n";
+    std::cout << "MGARD-X::SERIAL\n";
   } else if (strcmp(dev2, "x-cuda") == 0) {
     dev_type = mgard_x::device_type::CUDA;
     device_type2 = device::X_CUDA;
diff --git a/examples/mgard-x/HighLevelAPIs/Example.cpp b/examples/mgard-x/HighLevelAPIs/Example.cpp
index 1c608d93b4..94b5aeda3e 100644
--- a/examples/mgard-x/HighLevelAPIs/Example.cpp
+++ b/examples/mgard-x/HighLevelAPIs/Example.cpp
@@ -29,10 +29,10 @@ int main() {
   std::cout << "Done\n";
 
   std::cout
-      << "Decompressing with MGARD-X High level API with Serial backend...";
+      << "Decompressing with MGARD-X High level API with SERIAL backend...";
   // decompression
   void *decompressed_array_cpu = NULL;
-  config.dev_type = mgard_x::device_type::Serial;
+  config.dev_type = mgard_x::device_type::SERIAL;
   mgard_x::decompress(compressed_array_cpu, compressed_size,
                       decompressed_array_cpu, config, false);
diff --git a/examples/mgard-x/LowLevelAPIs/README.md b/examples/mgard-x/LowLevelAPIs/README.md
index 858ecced3c..2b8c445f0f 100644
--- a/examples/mgard-x/LowLevelAPIs/README.md
+++ b/examples/mgard-x/LowLevelAPIs/README.md
@@ -1,7 +1,7 @@
 # Compressing with MGARD-X Low-level APIs
 
 First, build and install MGARD-X.
-Then, run the following in `examples/mgard-x/LowLevelAPIs/Serial`, `examples/mgard-x/LowLevelAPIs/CUDA`, `examples/mgard-x/LowLevelAPIs/HIP`. Each folder contains a CMake project dedicated for a different kind of processor.
+Then, run the following in `examples/mgard-x/LowLevelAPIs/SERIAL`, `examples/mgard-x/LowLevelAPIs/CUDA`, `examples/mgard-x/LowLevelAPIs/HIP`. Each folder contains a CMake project dedicated to a different kind of processor.
 
 Build with CMake as follows or use the 'build_scripts.sh'.
 ```console
diff --git a/examples/mgard-x/LowLevelAPIs/Serial/CMakeLists.txt b/examples/mgard-x/LowLevelAPIs/SERIAL/CMakeLists.txt
similarity index 100%
rename from examples/mgard-x/LowLevelAPIs/Serial/CMakeLists.txt
rename to examples/mgard-x/LowLevelAPIs/SERIAL/CMakeLists.txt
diff --git a/examples/mgard-x/LowLevelAPIs/Serial/Example.cpp b/examples/mgard-x/LowLevelAPIs/SERIAL/Example.cpp
similarity index 74%
rename from examples/mgard-x/LowLevelAPIs/Serial/Example.cpp
rename to examples/mgard-x/LowLevelAPIs/SERIAL/Example.cpp
index d4b6fcaff5..1c2f22c015 100644
--- a/examples/mgard-x/LowLevelAPIs/Serial/Example.cpp
+++ b/examples/mgard-x/LowLevelAPIs/SERIAL/Example.cpp
@@ -13,16 +13,16 @@ int main() {
   double *in_array_cpu = new double[n1 * n2 * n3];
   //... load data into in_array_cpu
   std::vector<mgard_x::SIZE> shape{n1, n2, n3};
-  mgard_x::Hierarchy<3, double, mgard_x::Serial> hierarchy(shape);
-  mgard_x::Array<3, double, mgard_x::Serial> in_array(shape);
+  mgard_x::Hierarchy<3, double, mgard_x::SERIAL> hierarchy(shape);
+  mgard_x::Array<3, double, mgard_x::SERIAL> in_array(shape);
   in_array.load(in_array_cpu);
   std::cout << "Done\n";
 
-  std::cout << "Compressing with MGARD-X Serial backend...";
+  std::cout << "Compressing with MGARD-X SERIAL backend...";
   double tol = 0.01, s = 0, norm;
   mgard_x::Config config;
   config.lossless = mgard_x::lossless_type::Huffman_Zstd;
-  mgard_x::Array<1, unsigned char, mgard_x::Serial> compressed_array =
+  mgard_x::Array<1, unsigned char, mgard_x::SERIAL> compressed_array =
       mgard_x::compress(hierarchy, in_array, mgard_x::error_bound_type::REL,
                         tol, s, norm, config);
   // Get compressed size in number of bytes.
@@ -30,9 +30,9 @@ int main() {
   unsigned char *compressed_array_cpu = compressed_array.hostCopy();
   std::cout << "Done\n";
 
-  std::cout << "Decompressing with MGARD-X Serial backend...";
+  std::cout << "Decompressing with MGARD-X SERIAL backend...";
   // decompression
-  mgard_x::Array<3, double, mgard_x::Serial> decompressed_array =
+  mgard_x::Array<3, double, mgard_x::SERIAL> decompressed_array =
       mgard_x::decompress(hierarchy, compressed_array,
                           mgard_x::error_bound_type::REL, tol, s, norm, config);
   delete[] in_array_cpu;
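
Review note: the SERIAL example above should carry over to the SYCL backend this patch introduces by swapping the `DeviceType` template argument. A minimal sketch, assuming `mgard_x::SYCL` is exposed the same way as `mgard_x::SERIAL` (the `processor_type::X_SYCL` handling later in this patch suggests it is):

```cpp
// Sketch only: assumes mgard_x::SYCL mirrors the mgard_x::SERIAL DeviceType
// used in the example above; everything else follows the same low-level flow.
#include "compress_x_lowlevel.hpp"
#include <vector>

int main() {
  mgard_x::SIZE n1 = 10, n2 = 20, n3 = 30;
  double *in_array_cpu = new double[n1 * n2 * n3];
  //... load data into in_array_cpu
  std::vector<mgard_x::SIZE> shape{n1, n2, n3};
  mgard_x::Hierarchy<3, double, mgard_x::SYCL> hierarchy(shape);
  mgard_x::Array<3, double, mgard_x::SYCL> in_array(shape);
  in_array.load(in_array_cpu);

  double tol = 0.01, s = 0, norm;
  mgard_x::Config config;
  config.lossless = mgard_x::lossless_type::Huffman_Zstd;
  mgard_x::Array<1, unsigned char, mgard_x::SYCL> compressed_array =
      mgard_x::compress(hierarchy, in_array, mgard_x::error_bound_type::REL,
                        tol, s, norm, config);
  mgard_x::Array<3, double, mgard_x::SYCL> decompressed_array =
      mgard_x::decompress(hierarchy, compressed_array,
                          mgard_x::error_bound_type::REL, tol, s, norm, config);
  delete[] in_array_cpu;
  return 0;
}
```
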
diff --git a/examples/mgard-x/LowLevelAPIs/Serial/build_script.sh b/examples/mgard-x/LowLevelAPIs/SERIAL/build_script.sh
similarity index 100%
rename from examples/mgard-x/LowLevelAPIs/Serial/build_script.sh
rename to examples/mgard-x/LowLevelAPIs/SERIAL/build_script.sh
diff --git a/examples/mgard-x/MDR-X/README.md b/examples/mgard-x/MDR-X/README.md
index 0e9c917f60..c8143f98ff 100644
--- a/examples/mgard-x/MDR-X/README.md
+++ b/examples/mgard-x/MDR-X/README.md
@@ -1,7 +1,7 @@
 # Refactor and progressively reconstruct data with MDR-X
 
 First, build and install MGARD-X.
-Then, run the following in `examples/mgard-x/MDR-X/Serial`, `examples/mgard-x/MDR-X/CUDA`, `examples/mgard-x/MDR-X/HIP`. Each folder contains a CMake project dedicated for a different kind of processor.
+Then, run the following in `examples/mgard-x/MDR-X/SERIAL`, `examples/mgard-x/MDR-X/CUDA`, `examples/mgard-x/MDR-X/HIP`. Each folder contains a CMake project dedicated to a different kind of processor.
 
 Build with CMake as follows or use the 'build_scripts.sh'.
 ```console
diff --git a/examples/mgard-x/MDR-X/Serial/CMakeLists.txt b/examples/mgard-x/MDR-X/SERIAL/CMakeLists.txt
similarity index 100%
rename from examples/mgard-x/MDR-X/Serial/CMakeLists.txt
rename to examples/mgard-x/MDR-X/SERIAL/CMakeLists.txt
diff --git a/examples/mgard-x/MDR-X/Serial/build_script.sh b/examples/mgard-x/MDR-X/SERIAL/build_script.sh
similarity index 100%
rename from examples/mgard-x/MDR-X/Serial/build_script.sh
rename to examples/mgard-x/MDR-X/SERIAL/build_script.sh
diff --git a/examples/mgard-x/MDR-X/Serial/reconstructor.cpp b/examples/mgard-x/MDR-X/SERIAL/reconstructor.cpp
similarity index 99%
rename from examples/mgard-x/MDR-X/Serial/reconstructor.cpp
rename to examples/mgard-x/MDR-X/SERIAL/reconstructor.cpp
index fcee4c992d..d555505709 100644
--- a/examples/mgard-x/MDR-X/Serial/reconstructor.cpp
+++ b/examples/mgard-x/MDR-X/SERIAL/reconstructor.cpp
@@ -172,7 +172,7 @@ int main(int argc, char **argv) {
   using T = float;
   using T_stream = uint32_t;
   using T_error = double;
-  using DeviceType = mgard_x::Serial;
+  using DeviceType = mgard_x::SERIAL;
   const mgard_x::DIM D = 3;
 
   mgard_x::Hierarchy<D, T, DeviceType> hierarchy(dims, 0, num_levels - 1);
diff --git a/examples/mgard-x/MDR-X/Serial/refactor.cpp b/examples/mgard-x/MDR-X/SERIAL/refactor.cpp
similarity index 99%
rename from examples/mgard-x/MDR-X/Serial/refactor.cpp
rename to examples/mgard-x/MDR-X/SERIAL/refactor.cpp
index 9516d61607..6b46849a92 100644
--- a/examples/mgard-x/MDR-X/Serial/refactor.cpp
+++ b/examples/mgard-x/MDR-X/SERIAL/refactor.cpp
@@ -79,7 +79,7 @@ int main(int argc, char **argv) {
   using T = float;
   using T_stream = uint32_t;
   using T_error = double;
-  using DeviceType = mgard_x::Serial;
+  using DeviceType = mgard_x::SERIAL;
   if (num_bitplanes > 32) {
     num_bitplanes = 32;
     std::cout << "Only less than 32 bitplanes are supported for "
diff --git a/include/MGARDXConfig.h.in b/include/MGARDXConfig.h.in
index 86b13f586f..a403b002dc 100644
--- a/include/MGARDXConfig.h.in
+++ b/include/MGARDXConfig.h.in
@@ -26,16 +26,16 @@
 #define MGARD_ENABLE_HIP 0
 #endif
 
+#if '@MGARD_ENABLE_SYCL@' == 'ON'
+#define MGARD_ENABLE_SYCL 1
+#else
+#define MGARD_ENABLE_SYCL 0
+#endif
+
 #if '@MGARD_ENABLE_LEGACY_CUDA@' == 'ON'
 #define MGARD_ENABLE_LEGACY_CUDA 1
 #else
 #define MGARD_ENABLE_LEGACY_CUDA 0
 #endif
 
-// #if '@MGARD_ENABLE_SYCL@' == 'ON'
-// #define MGARD_ENABLE_SYCL 1
-// #else
-// #define MGARD_ENABLE_SYCL 0
-// #endif
-
 #endif
diff --git a/include/compress_x.hpp b/include/compress_x.hpp
index 1b4e9fae2d..1d2189cbb6 100644
--- a/include/compress_x.hpp
+++ b/include/compress_x.hpp
@@ -5,9 +5,9 @@
  * Date: March 17, 2022
  */
 
-#include "mgard-x/CompressionWorkflow.h"
+#include "MGARDXConfig.h"
 #include "mgard-x/RuntimeX/RuntimeXPublic.h"
-#include "mgard-x/Types.h"
+#include "mgard-x/Utilities/Types.h"
 #include
 
 #ifndef MGARD_X_API_H
diff --git a/include/compress_x_lowlevel.hpp b/include/compress_x_lowlevel.hpp
index 57ec7cd214..6b9507e3c0 100644
--- a/include/compress_x_lowlevel.hpp
+++ b/include/compress_x_lowlevel.hpp
@@ -5,8 +5,8 @@
  * Date: March 17, 2022
  */
 
-#include "mgard-x/CompressionWorkflow.hpp"
-#include "mgard-x/Hierarchy.hpp"
+#include "mgard-x/CompressionLowLevel/CompressionLowLevel.hpp"
+#include "mgard-x/Hierarchy/Hierarchy.hpp" #include "mgard-x/RuntimeX/DataStructures/Array.hpp" #include "mgard-x/RuntimeX/RuntimeX.h" diff --git a/include/mgard-x/CMakeLists.txt b/include/mgard-x/CMakeLists.txt index 753cc304d3..df47eb4dcc 100644 --- a/include/mgard-x/CMakeLists.txt +++ b/include/mgard-x/CMakeLists.txt @@ -1,16 +1,10 @@ add_subdirectory(DataRefactoring) +add_subdirectory(CompressionLowLevel) +add_subdirectory(CompressionHighLevel) +add_subdirectory(Hierarchy) add_subdirectory(Lossless) add_subdirectory(Quantization) add_subdirectory(RuntimeX) -list(APPEND MGARD_X_HEADER - ${CMAKE_CURRENT_SOURCE_DIR}/CompressionWorkflow.h - ${CMAKE_CURRENT_SOURCE_DIR}/CompressionWorkflow.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/Hierarchy.h - ${CMAKE_CURRENT_SOURCE_DIR}/Hierarchy.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/HighLevelAPI.h - ${CMAKE_CURRENT_SOURCE_DIR}/HighLevelAPI.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/Metadata.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/Types.h - ) + set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) diff --git a/include/mgard-x/CompressionHighLevel/CMakeLists.txt b/include/mgard-x/CompressionHighLevel/CMakeLists.txt new file mode 100644 index 0000000000..58c83c1311 --- /dev/null +++ b/include/mgard-x/CompressionHighLevel/CMakeLists.txt @@ -0,0 +1,6 @@ +list(APPEND MGARD_X_HEADER + ${CMAKE_CURRENT_SOURCE_DIR}/CompressionHighLevel.h + ${CMAKE_CURRENT_SOURCE_DIR}/CompressionHighLevel.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/Metadata.hpp + ) +set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/HighLevelAPI.h b/include/mgard-x/CompressionHighLevel/CompressionHighLevel.h similarity index 88% rename from include/mgard-x/HighLevelAPI.h rename to include/mgard-x/CompressionHighLevel/CompressionHighLevel.h index 1082e76c4f..c56c283900 100644 --- a/include/mgard-x/HighLevelAPI.h +++ b/include/mgard-x/CompressionHighLevel/CompressionHighLevel.h @@ -11,13 +11,12 @@ #include #include -// #include "compress_cuda.hpp" -#include "mgard-x/Hierarchy.h" -#include "mgard-x/Metadata.hpp" -#include "mgard-x/RuntimeX/RuntimeXPublic.h" +#include "../Hierarchy/Hierarchy.h" +#include "../RuntimeX/RuntimeXPublic.h" +#include "Metadata.hpp" -#ifndef MGARD_X_HIGH_LEVEL_API_H -#define MGARD_X_HIGH_LEVEL_API_H +#ifndef MGARD_X_COMPRESSION_HIGH_LEVEL_API_H +#define MGARD_X_COMPRESSION_HIGH_LEVEL_API_H namespace mgard_x { @@ -66,10 +65,6 @@ void decompress(const void *compressed_data, size_t compressed_size, void *&decompressed_data, data_type &dtype, std::vector &shape, bool output_pre_allocated); -template void BeginAutoTuning(); - -template void EndAutoTuning(); - } // namespace mgard_x #endif \ No newline at end of file diff --git a/include/mgard-x/HighLevelAPI.hpp b/include/mgard-x/CompressionHighLevel/CompressionHighLevel.hpp similarity index 99% rename from include/mgard-x/HighLevelAPI.hpp rename to include/mgard-x/CompressionHighLevel/CompressionHighLevel.hpp index 4369b04773..d88d2e3d8d 100644 --- a/include/mgard-x/HighLevelAPI.hpp +++ b/include/mgard-x/CompressionHighLevel/CompressionHighLevel.hpp @@ -11,15 +11,15 @@ #include #include +#include "../Hierarchy/Hierarchy.hpp" +#include "../RuntimeX/RuntimeX.h" +#include "Metadata.hpp" #include "compress_x.hpp" -#include "mgard-x/Hierarchy.hpp" -#include "mgard-x/Metadata.hpp" -#include "mgard-x/RuntimeX/RuntimeX.h" -#include "Utilities/CheckEndianess.h" +#include "../CompressionLowLevel/CompressionLowLevel.h" -#ifndef MGARD_X_HIGH_LEVEL_API_HPP -#define MGARD_X_HIGH_LEVEL_API_HPP +#ifndef 
MGARD_X_COMPRESSION_HIGH_LEVEL_API_HPP +#define MGARD_X_COMPRESSION_HIGH_LEVEL_API_HPP namespace mgard_x { @@ -292,12 +292,14 @@ void compress(std::vector shape, T tol, T s, enum error_bound_type type, Hierarchy hierarchy(shape, config.uniform_coord_mode); Metadata m; - if (std::is_same::value) { - m.ptype = processor_type::X_Serial; + if (std::is_same::value) { + m.ptype = processor_type::X_SERIAL; } else if (std::is_same::value) { m.ptype = processor_type::X_CUDA; } else if (std::is_same::value) { m.ptype = processor_type::X_HIP; + } else if (std::is_same::value) { + m.ptype = processor_type::X_SYCL; } m.ebtype = type; m.tol = tol; @@ -325,7 +327,6 @@ void compress(std::vector shape, T tol, T s, enum error_bound_type type, #endif m.dtype = std::is_same::value ? data_type::Double : data_type::Float; - m.etype = CheckEndianess(); m.dstype = data_structure_type::Cartesian_Grid_Uniform; m.total_dims = D; m.shape = std::vector(D); @@ -521,12 +522,14 @@ void compress(std::vector shape, T tol, T s, enum error_bound_type type, Hierarchy hierarchy(shape, coords); Metadata m; - if (std::is_same::value) { - m.ptype = processor_type::X_Serial; + if (std::is_same::value) { + m.ptype = processor_type::X_SERIAL; } else if (std::is_same::value) { m.ptype = processor_type::X_CUDA; } else if (std::is_same::value) { m.ptype = processor_type::X_HIP; + } else if (std::is_same::value) { + m.ptype = processor_type::X_SYCL; } m.ebtype = type; m.tol = tol; @@ -554,7 +557,6 @@ void compress(std::vector shape, T tol, T s, enum error_bound_type type, #endif m.dtype = std::is_same::value ? data_type::Double : data_type::Float; - m.etype = CheckEndianess(); m.dstype = data_structure_type::Cartesian_Grid_Non_Uniform; m.total_dims = D; m.shape = std::vector(D); @@ -1604,14 +1606,6 @@ void decompress(const void *compressed_data, size_t compressed_size, dtype, shape, config, output_pre_allocated); } -template void BeginAutoTuning() { - AutoTuner::ProfileKernels = true; -} - -template void EndAutoTuning() { - AutoTuner::ProfileKernels = false; -} - } // namespace mgard_x #endif \ No newline at end of file diff --git a/include/mgard-x/Metadata.hpp b/include/mgard-x/CompressionHighLevel/Metadata.hpp similarity index 98% rename from include/mgard-x/Metadata.hpp rename to include/mgard-x/CompressionHighLevel/Metadata.hpp index b7a61c3c1e..a0111629ce 100644 --- a/include/mgard-x/Metadata.hpp +++ b/include/mgard-x/CompressionHighLevel/Metadata.hpp @@ -5,9 +5,9 @@ * Date: March 17, 2022 */ +#include "../RuntimeX/RuntimeX.h" +#include "../Utilities/Types.h" #include "MGARDConfig.hpp" -#include "RuntimeX/RuntimeX.h" -#include "Types.h" #include "format.hpp" #include "proto/mgard.pb.h" #include @@ -160,8 +160,8 @@ template struct Metadata { } std::cout << "Backend: "; - if (ptype == processor_type::X_Serial) { - std::cout << "X_Serial\n"; + if (ptype == processor_type::X_SERIAL) { + std::cout << "X_SERIAL\n"; } else if (ptype == processor_type::X_CUDA) { std::cout << "X_CUDA\n"; } else if (ptype == processor_type::X_HIP) { @@ -173,7 +173,11 @@ template struct Metadata { private: SERIALIZED_TYPE *SerializeAll(uint32_t &total_size) { - + if (big_endian()) { + etype = endiness_type::Big_Endian; + } else { + etype = endiness_type::Little_Endian; + } total_size = 0; // about MGARD software @@ -500,7 +504,7 @@ template struct Metadata { { // Device mgard::pb::Device &device = *header.mutable_device(); - if (ptype == processor_type::X_Serial) { + if (ptype == processor_type::X_SERIAL) { device.set_backend(mgard::pb::Device::X_SERIAL); } 
else if (ptype == processor_type::X_CUDA) { device.set_backend(mgard::pb::Device::X_CUDA); @@ -746,7 +750,7 @@ template struct Metadata { { // Device const mgard::pb::Device device = header.device(); if (device.backend() == mgard::pb::Device::X_SERIAL) { - ptype = processor_type::X_Serial; + ptype = processor_type::X_SERIAL; } else if (device.backend() == mgard::pb::Device::X_CUDA) { ptype = processor_type::X_CUDA; } else if (device.backend() == mgard::pb::Device::X_HIP) { diff --git a/include/mgard-x/CompressionLowLevel/CMakeLists.txt b/include/mgard-x/CompressionLowLevel/CMakeLists.txt new file mode 100644 index 0000000000..7dcac948d7 --- /dev/null +++ b/include/mgard-x/CompressionLowLevel/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND MGARD_X_HEADER + ${CMAKE_CURRENT_SOURCE_DIR}/CompressionLowLevel.h + ${CMAKE_CURRENT_SOURCE_DIR}/CompressionLowLevel.hpp + ) +set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/CompressionWorkflow.h b/include/mgard-x/CompressionLowLevel/CompressionLowLevel.h similarity index 83% rename from include/mgard-x/CompressionWorkflow.h rename to include/mgard-x/CompressionLowLevel/CompressionLowLevel.h index 0ee4e5e342..a45c89b551 100644 --- a/include/mgard-x/CompressionWorkflow.h +++ b/include/mgard-x/CompressionLowLevel/CompressionLowLevel.h @@ -5,11 +5,11 @@ * Date: March 17, 2022 */ -#ifndef MGARD_X_COMPRESSION_WORKFLOW_H -#define MGARD_X_COMPRESSION_WORKFLOW_H +#ifndef MGARD_X_COMPRESSION_LOW_LEVEL_H +#define MGARD_X_COMPRESSION_LOW_LEVEL_H -#include "Hierarchy.h" -#include "RuntimeX/RuntimeXPublic.h" +#include "../Hierarchy/Hierarchy.hpp" +#include "../RuntimeX/RuntimeXPublic.h" namespace mgard_x { diff --git a/include/mgard-x/CompressionWorkflow.hpp b/include/mgard-x/CompressionLowLevel/CompressionLowLevel.hpp similarity index 91% rename from include/mgard-x/CompressionWorkflow.hpp rename to include/mgard-x/CompressionLowLevel/CompressionLowLevel.hpp index edc3ffa8a0..47062970f7 100644 --- a/include/mgard-x/CompressionWorkflow.hpp +++ b/include/mgard-x/CompressionLowLevel/CompressionLowLevel.hpp @@ -11,38 +11,37 @@ #include #include -#include "Types.h" +#include "../Utilities/Types.h" -#include "CompressionWorkflow.h" -#include "Hierarchy.hpp" -#include "RuntimeX/RuntimeX.h" +#include "../Hierarchy/Hierarchy.hpp" +#include "../RuntimeX/RuntimeX.h" +#include "CompressionLowLevel.h" -#include "DataRefactoring/MultiDimension/DataRefactoring.h" -#include "DataRefactoring/SingleDimension/DataRefactoring.h" +#include "../DataRefactoring/MultiDimension/DataRefactoring.h" +#include "../DataRefactoring/SingleDimension/DataRefactoring.h" -#include "Quantization/LinearQuantization.hpp" +#include "../Quantization/LinearQuantization.hpp" // #include "Linearization/LevelLinearizer.hpp" -#include "Linearization/LevelLinearizer2.hpp" +#include "../Linearization/LevelLinearizer2.hpp" -#include "Lossless/ParallelHuffman/Huffman.hpp" +#include "../Lossless/ParallelHuffman/Huffman.hpp" #ifdef MGARDX_COMPILE_CUDA -#include "Lossless/Cascaded.hpp" -#include "Lossless/LZ4.hpp" +#include "../Lossless/Cascaded.hpp" +#include "../Lossless/LZ4.hpp" #endif -#include "Lossless/CPU.hpp" -#include "Lossless/Zstd.hpp" -#include "Utilities/CheckEndianess.h" +#include "../Lossless/CPU.hpp" +#include "../Lossless/Zstd.hpp" // for debugging // #include "../cuda/CommonInternal.h" // #include "../cuda/DataRefactoring.h" // #include "../cuda/SubArray.h" -#ifndef MGARD_X_COMPRESSION_WORKFLOW_HPP -#define MGARD_X_COMPRESSION_WORKFLOW_HPP 
+#ifndef MGARD_X_COMPRESSION_LOW_LEVEL_HPP +#define MGARD_X_COMPRESSION_LOW_LEVEL_HPP #define BLOCK_SIZE 64 @@ -58,6 +57,11 @@ compress(Hierarchy &hierarchy, Array &in_array, enum error_bound_type type, T tol, T s, T &norm, Config config) { DeviceRuntime::SelectDevice(config.dev_id); + if (config.timing) { + std::cout << log::log_info + << "Select device: " << DeviceRuntime::GetDeviceName() + << "\n"; + } Timer timer_total, timer_each; for (DIM i = 0; i < D; i++) { if (hierarchy.shape[i] != in_array.shape()[i]) { @@ -193,6 +197,8 @@ compress(Hierarchy &hierarchy, SubArray<1, LENGTH, DeviceType> outlier_idx_subarray(outlier_idx_array); SubArray<1, QUANTIZED_INT, DeviceType> outliers_subarray(outliers_array); + DeviceRuntime::SyncQueue(0); + LevelwiseLinearQuantizeND().Execute( SubArray<1, SIZE, DeviceType>(hierarchy.ranges), hierarchy.l_target, quantizers_subarray, SubArray<2, T, DeviceType>(hierarchy.volumes_array), @@ -201,11 +207,12 @@ compress(Hierarchy &hierarchy, SubArray<1, SIZE, DeviceType>(hierarchy.shapes[0], true), SubArray<1, LENGTH, DeviceType>(outlier_count_array), outlier_idx_subarray, outliers_subarray, 0); + MemoryManager::Copy1D(&outlier_count, outlier_count_array.data(), 1, 0); - DeviceRuntime::SyncDevice(); - // m.huff_outlier_count = outlier_count; + if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Quantization"); timer_each.clear(); @@ -213,15 +220,12 @@ compress(Hierarchy &hierarchy, << total_elems << " (" << (double)100 * outlier_count / total_elems << "%)\n"; } - if (debug_print) { - // PrintSubarray("decomposed", SubArray(in_array)); - // PrintSubarray("signed_quanzited_array", SubArray(signed_quanzited_array)); std::cout << "outlier_count: " << - // outlier_count << std::endl; PrintSubarray("quantized outliers_array", - // SubArray<1, QUANTIZED_INT, DeviceType>(outliers_array)); - // PrintSubarray("quantized outlier_idx_array", SubArray<1, LENGTH, - // DeviceType>(outlier_idx_array)); - } + // if (debug_print) { + // PrintSubarray("decomposed", SubArray(in_array)); + // PrintSubarray("quantized_subarray", quantized_subarray); + // PrintSubarray("quantized outliers_array", outliers_subarray); + // PrintSubarray("quantized outlier_idx_array", outlier_idx_subarray); + // } Array<1, Byte, DeviceType> lossless_compressed_array; SubArray<1, Byte, DeviceType> lossless_compressed_subarray; @@ -247,6 +251,7 @@ compress(Hierarchy &hierarchy, } DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Level Linearizer type: " + std::to_string(config.reorder)); @@ -278,6 +283,7 @@ compress(Hierarchy &hierarchy, outliers_subarray); lossless_compressed_subarray = SubArray(lossless_compressed_array); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Huffman Compress"); std::cout << log::log_info @@ -301,6 +307,7 @@ compress(Hierarchy &hierarchy, CPUCompress(quantized_linearized_subarray); lossless_compressed_subarray = SubArray(lossless_compressed_array); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("CPU Lossless"); std::cout << log::log_info << "CPU Lossless compress ratio: " @@ -327,6 +334,7 @@ compress(Hierarchy &hierarchy, lossless_compressed_subarray = SubArray(lossless_compressed_array); SIZE lz4_after_size = lossless_compressed_subarray.getShape(0); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("LZ4 Compress"); std::cout << log::log_info << "LZ4 block size: 
" << config.lz4_block_size @@ -359,6 +367,7 @@ compress(Hierarchy &hierarchy, } if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_total.end(); timer_total.print("Overall Compress"); std::cout << log::log_time << "Compression Throughput: " @@ -376,6 +385,11 @@ decompress(Hierarchy &hierarchy, Array<1, unsigned char, DeviceType> &compressed_array, enum error_bound_type type, T tol, T s, T norm, Config config) { DeviceRuntime::SelectDevice(config.dev_id); + if (config.timing) { + std::cout << log::log_info + << "Select device: " << DeviceRuntime::GetDeviceName() + << "\n"; + } Timer timer_total, timer_each; SIZE total_elems = @@ -404,8 +418,8 @@ decompress(Hierarchy &hierarchy, lossless_compressed_array = LZ4Decompress(lossless_compressed_subarray); lossless_compressed_subarray = SubArray(lossless_compressed_array); - DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("LZ4 Decompress"); timer_each.clear(); @@ -424,8 +438,8 @@ decompress(Hierarchy &hierarchy, lossless_compressed_array = ZstdDecompress(lossless_compressed_subarray); lossless_compressed_subarray = SubArray(lossless_compressed_array); - DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Zstd Decompress"); timer_each.clear(); @@ -470,8 +484,8 @@ decompress(Hierarchy &hierarchy, } else { std::cout << log::log_err << "wrong reodering option.\n"; } - DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Level Linearizer type: " + std::to_string(config.reorder)); @@ -484,8 +498,8 @@ decompress(Hierarchy &hierarchy, total_elems, 0); } - DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Huffman Decompress"); timer_each.clear(); @@ -509,8 +523,8 @@ decompress(Hierarchy &hierarchy, } else { std::cout << log::log_err << "wrong reodering type.\n"; } - DeviceRuntime::SyncDevice(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Level Linearizer type: " + std::to_string(config.reorder)); @@ -524,6 +538,7 @@ decompress(Hierarchy &hierarchy, (QUANTIZED_INT *)quantized_linearized_array.data(), total_elems, 0); } if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("CPU Lossless"); timer_each.clear(); @@ -565,10 +580,8 @@ decompress(Hierarchy &hierarchy, SubArray<1, SIZE, DeviceType>(hierarchy.shapes[0], true), outlier_count, outlier_idx_subarray, outliers_subarray, 0); - DeviceRuntime::SyncDevice(); - - // hierarchy.sync_all(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Dequantization"); timer_each.clear(); @@ -588,15 +601,16 @@ decompress(Hierarchy &hierarchy, recompose_single(hierarchy, decompressed_subarray, hierarchy.l_target, 0); } - // hierarchy.sync_all(); + if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_each.end(); timer_each.print("Recomposition"); timer_each.clear(); } - // hierarchy.sync_all(); if (config.timing) { + DeviceRuntime::SyncQueue(0); timer_total.end(); timer_total.print("Overall Decompression"); std::cout << log::log_time << "Decompression Throughput: " diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt index 5b73c54270..ba5d9ebe59 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt +++ 
b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt @@ -1,8 +1,6 @@ list(APPEND MGARD_X_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/GPKFunctor.h ${CMAKE_CURRENT_SOURCE_DIR}/GridProcessingKernel.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/GridProcessingKernel.h ${CMAKE_CURRENT_SOURCE_DIR}/GridProcessingKernel3D.hpp - ${CMAKE_CURRENT_SOURCE_DIR}/GridProcessingKernel3D.h ) set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.hpp new file mode 100644 index 0000000000..55105682eb --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.hpp @@ -0,0 +1,93 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "GridProcessingKernel3D.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS_3D +#define MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS_3D + +namespace mgard_x { + +template +void CalcCoefficients3D(Hierarchy &hierarchy, + SubArray dinput, + SubArray &doutput, SIZE l, + int queue_idx) { + + int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); + int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + + dinput.project(0, 1, 2); + doutput.project(0, 1, 2); + + SIZE f = hierarchy.dofs[0][l]; + SIZE c = hierarchy.dofs[1][l]; + SIZE r = hierarchy.dofs[2][l]; + SIZE ff = hierarchy.dofs[0][l + 1]; + SIZE cc = hierarchy.dofs[1][l + 1]; + SIZE rr = hierarchy.dofs[2][l + 1]; + + SubArray dcoarse = doutput; + dcoarse.resize({ff, cc, rr}); + SubArray dcoeff_f = doutput; + dcoeff_f.offset({ff, 0, 0}); + dcoeff_f.resize({f - ff, cc, rr}); + SubArray dcoeff_c = doutput; + dcoeff_c.offset({0, cc, 0}); + dcoeff_c.resize({ff, c - cc, rr}); + SubArray dcoeff_r = doutput; + dcoeff_r.offset({0, 0, rr}); + dcoeff_r.resize({ff, cc, r - rr}); + SubArray dcoeff_cf = doutput; + dcoeff_cf.offset({ff, cc, 0}); + dcoeff_cf.resize({f - ff, c - cc, rr}); + SubArray dcoeff_rf = doutput; + dcoeff_rf.offset({ff, 0, rr}); + dcoeff_rf.resize({f - ff, cc, r - rr}); + SubArray dcoeff_rc = doutput; + dcoeff_rc.offset({0, cc, rr}); + dcoeff_rc.resize({ff, c - cc, r - rr}); + SubArray dcoeff_rcf = doutput; + dcoeff_rcf.offset({ff, cc, rr}); + dcoeff_rcf.resize({f - ff, c - cc, r - rr}); + + GpkReo3D().Execute( + hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], + hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], SubArray(hierarchy.ratio_array[2][l]), + SubArray(hierarchy.ratio_array[1][l]), + SubArray(hierarchy.ratio_array[0][l]), dinput, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, + queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l], + hierarchy.dofs[0][l], doutput.data(), doutput.getLd(0), + doutput.getLd(1), doutput.getLd(0), + prefix + "gpk_reo_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, 
multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after pi_Ql_reo", doutput); + } +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.hpp new file mode 100644 index 0000000000..6363c83670 --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.hpp @@ -0,0 +1,222 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "CalcCoefficientsPointers.hpp" +#include "GridProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS_ND +#define MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS_ND + +namespace mgard_x { + +template +void CalcCoefficientsND(Hierarchy &hierarchy, + SubArray dinput1, + SubArray dinput2, + SubArray &doutput, SIZE l, + int queue_idx) { + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + // printf("interpolate 1-3D\n"); + + SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, + dcoeff_rf, dcoeff_rc, dcoeff_rcf; + + DIM curr_dims[3]; + + int unprocessed_idx = 0; + curr_dims[0] = 0; + curr_dims[1] = 1; + curr_dims[2] = 2; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); + + CalcCoefficientsPointers(hierarchy, curr_dims, l, doutput, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, + queue_idx); + + for (DIM d = 3; d < D; d += 2) { + // copy back to input1 for interpolation again + // LwpkReo().Execute(doutput, dinput1, queue_idx); + CopyND(doutput, dinput1, queue_idx); + + // printf("interpolate %u-%uD\n", d+1, d+2); + curr_dims[0] = 0; + curr_dims[1] = d; + curr_dims[2] = d + 1; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); + CalcCoefficientsPointers(hierarchy, curr_dims, l, doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, + dcoeff_rc, dcoeff_rcf); + + if (D - d == 1) { + unprocessed_idx += 1; + + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + 
SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, queue_idx); + + } else { // D - d >= 2 + unprocessed_idx += 2; + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), + // unprocessed_dims_subarray, + curr_dims[2], curr_dims[1], curr_dims[0], + // ratio_r, ratio_c, ratio_f, + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, queue_idx); + } + } + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D("after interpolation", doutput); + } // debug + + unprocessed_idx = 0; + // printf("reorder 1-3D\n"); + curr_dims[0] = 0; + curr_dims[1] = 1; + curr_dims[2] = 2; + dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); + dinput1.project(curr_dims[0], curr_dims[1], + curr_dims[2]); // reuse input1 as temp output + + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput2, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, + queue_idx); + + DIM D_reduced = D % 2 == 0 ? 
D - 1 : D - 2; + for (DIM d = 3; d < D_reduced; d += 2) { + // copy back to input2 for reordering again + + // LwpkReo().Execute(dinput1, dinput2, queue_idx); + CopyND(dinput1, dinput2, queue_idx); + + unprocessed_idx += 2; + // printf("reorder %u-%uD\n", d+1, d+2); + curr_dims[0] = 0; + curr_dims[1] = d; + curr_dims[2] = d + 1; + dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); + dinput1.project(curr_dims[0], curr_dims[1], + curr_dims[2]); // reuse input1 as temp output + + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, + dcoeff_rc, dcoeff_rcf); + + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput2, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, queue_idx); + } + + // printf("calc coeff %u-%dD\n", D_reduced+1, D_reduced+2); + curr_dims[0] = 0; + curr_dims[1] = D_reduced; + curr_dims[2] = D_reduced + 1; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], + curr_dims[2]); // reuse input1 as temp output + CalcCoefficientsPointers(hierarchy, curr_dims, l, doutput, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + if (D - D_reduced == 1) { + unprocessed_idx += 1; + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, queue_idx); + + } else { // D-D_reduced == 2 + unprocessed_idx += 2; + + GpkReo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, queue_idx); + } + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D("after calc coeff", doutput); + } // debug +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsPointers.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsPointers.hpp new file mode 100644 index 0000000000..e0d1948d5f --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsPointers.hpp @@ -0,0 +1,88 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_COEFFICIENT_POINTERS +#define MGARD_X_DATA_REFACTORING_CALC_COEFFICIENT_POINTERS + +namespace mgard_x { + +template +void CalcCoefficientsPointers( + Hierarchy &hierarchy, DIM curr_dims[3], DIM l, + SubArray doutput, SubArray &dcoarse, + SubArray &dcoeff_f, SubArray &dcoeff_c, + SubArray &dcoeff_r, SubArray &dcoeff_cf, + SubArray &dcoeff_rf, + SubArray &dcoeff_rc, + SubArray &dcoeff_rcf) { + + SIZE n[3]; + SIZE nn[3]; + for (DIM d = 0; d < 3; d++) { + n[d] = hierarchy.dofs[curr_dims[d]][l]; + nn[d] = hierarchy.dofs[curr_dims[d]][l + 1]; + } + + dcoarse = doutput; + dcoarse.resize(curr_dims[0], nn[0]); + dcoarse.resize(curr_dims[1], nn[1]); + dcoarse.resize(curr_dims[2], nn[2]); + + dcoeff_f = doutput; + dcoeff_f.offset(curr_dims[0], nn[0]); + dcoeff_f.resize(curr_dims[0], n[0] - nn[0]); + dcoeff_f.resize(curr_dims[1], nn[1]); + dcoeff_f.resize(curr_dims[2], nn[2]); + + dcoeff_c = doutput; + dcoeff_c.offset(curr_dims[1], nn[1]); + dcoeff_c.resize(curr_dims[0], nn[0]); + dcoeff_c.resize(curr_dims[1], n[1] - nn[1]); + dcoeff_c.resize(curr_dims[2], nn[2]); + + dcoeff_r = doutput; + dcoeff_r.offset(curr_dims[2], nn[2]); + dcoeff_r.resize(curr_dims[0], nn[0]); + dcoeff_r.resize(curr_dims[1], nn[1]); + dcoeff_r.resize(curr_dims[2], n[2] - nn[2]); + + dcoeff_cf = doutput; + dcoeff_cf.offset(curr_dims[0], nn[0]); + dcoeff_cf.offset(curr_dims[1], nn[1]); + dcoeff_cf.resize(curr_dims[0], n[0] - nn[0]); + dcoeff_cf.resize(curr_dims[1], n[1] - nn[1]); + dcoeff_cf.resize(curr_dims[2], nn[2]); + + dcoeff_rf = doutput; + dcoeff_rf.offset(curr_dims[0], nn[0]); + dcoeff_rf.offset(curr_dims[2], nn[2]); + dcoeff_rf.resize(curr_dims[0], n[0] - nn[0]); + dcoeff_rf.resize(curr_dims[1], nn[1]); + dcoeff_rf.resize(curr_dims[2], n[2] - nn[2]); + + dcoeff_rc = doutput; + dcoeff_rc.offset(curr_dims[1], nn[1]); + dcoeff_rc.offset(curr_dims[2], nn[2]); + dcoeff_rc.resize(curr_dims[0], nn[0]); + dcoeff_rc.resize(curr_dims[1], n[1] - nn[1]); + dcoeff_rc.resize(curr_dims[2], n[2] - nn[2]); + + dcoeff_rcf = doutput; + dcoeff_rcf.offset(curr_dims[0], nn[0]); + dcoeff_rcf.offset(curr_dims[1], nn[1]); + dcoeff_rcf.offset(curr_dims[2], nn[2]); + dcoeff_rcf.resize(curr_dims[0], n[0] - nn[0]); + dcoeff_rcf.resize(curr_dims[1], n[1] - nn[1]); + dcoeff_rcf.resize(curr_dims[2], n[2] - nn[2]); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.hpp new file mode 100644 index 0000000000..ec1cbebadf --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.hpp @@ -0,0 +1,94 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "GridProcessingKernel3D.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE_3D +#define MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE_3D + +namespace mgard_x { + +template +void CoefficientsRestore3D(Hierarchy &hierarchy, + SubArray dinput, + SubArray &doutput, SIZE l, + int queue_idx) { + + int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); + int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + + dinput.project(0, 1, 2); + doutput.project(0, 1, 2); + + SIZE f = hierarchy.dofs[0][l]; + SIZE c = hierarchy.dofs[1][l]; + SIZE r = hierarchy.dofs[2][l]; + SIZE ff = hierarchy.dofs[0][l + 1]; + SIZE cc = hierarchy.dofs[1][l + 1]; + SIZE rr = hierarchy.dofs[2][l + 1]; + + SubArray dcoarse = dinput; + dcoarse.resize({ff, cc, rr}); + SubArray dcoeff_f = dinput; + dcoeff_f.offset({ff, 0, 0}); + dcoeff_f.resize({f - ff, cc, rr}); + SubArray dcoeff_c = dinput; + dcoeff_c.offset({0, cc, 0}); + dcoeff_c.resize({ff, c - cc, rr}); + SubArray dcoeff_r = dinput; + dcoeff_r.offset({0, 0, rr}); + dcoeff_r.resize({ff, cc, r - rr}); + SubArray dcoeff_cf = dinput; + dcoeff_cf.offset({ff, cc, 0}); + dcoeff_cf.resize({f - ff, c - cc, rr}); + SubArray dcoeff_rf = dinput; + dcoeff_rf.offset({ff, 0, rr}); + dcoeff_rf.resize({f - ff, cc, r - rr}); + SubArray dcoeff_rc = dinput; + dcoeff_rc.offset({0, cc, rr}); + dcoeff_rc.resize({ff, c - cc, r - rr}); + SubArray dcoeff_rcf = dinput; + dcoeff_rcf.offset({ff, cc, rr}); + dcoeff_rcf.resize({f - ff, c - cc, r - rr}); + + GpkRev3D().Execute( + hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], + hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], SubArray(hierarchy.ratio_array[2][l]), + SubArray(hierarchy.ratio_array[1][l]), + SubArray(hierarchy.ratio_array[0][l]), doutput, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, 0, 0, 0, + hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], + queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l], + hierarchy.dofs[0][l], doutput.data(), doutput.getLd(0), + doutput.getLd(1), doutput.getLd(0), + prefix + "gpk_rev_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after coeff-restore", doutput); + } +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.hpp new file mode 100644 index 0000000000..c074ed4414 --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.hpp @@ -0,0 +1,232 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
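+ *
+ * CoefficientsRestoreND undoes the N-D coefficient calculation for D > 3.
+ * Dimensions 0-2 are restored in one 3-D pass; the remaining dimensions
+ * are then handled two at a time with curr_dims = {0, d, d+1}, reusing
+ * dinput1 as scratch between passes, while unprocessed_idx advances past
+ * the dimensions already restored so the kernel knows which ones still
+ * hold reordered data. Sketch of the pairing for D = 7 (illustrative):
+ *
+ *   pass 1: dims {0, 1, 2}   unprocessed_idx = 0
+ *   pass 2: dims {0, 3, 4}   unprocessed_idx += 2
+ *   pass 3: dims {0, 5, 6}   unprocessed_idx += 2
+ *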
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "CalcCoefficientsPointers.hpp" +#include "GridProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE_ND +#define MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE_ND + +namespace mgard_x { + +template +void CoefficientsRestoreND(Hierarchy &hierarchy, + SubArray dinput1, + SubArray dinput2, + SubArray &doutput, SIZE l, + int queue_idx) { + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + + SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, + dcoeff_rf, dcoeff_rc, dcoeff_rcf; + + DIM curr_dims[3]; + int unprocessed_idx = 0; + + // printf("interpolate-restore 1-3D\n"); + curr_dims[0] = 0; + curr_dims[1] = 1; + curr_dims[2] = 2; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); + + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, + 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], + hierarchy.dofs[curr_dims[0]][l], queue_idx); + + for (DIM d = 3; d < D; d += 2) { + // LwpkReo().Execute(doutput, dinput1, queue_idx); + CopyND(doutput, dinput1, queue_idx); + + // printf("interpolate-restore %u-%uD\n", d+1, d+2); + curr_dims[0] = 0; + curr_dims[1] = d; + curr_dims[2] = d + 1; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, + dcoeff_rc, dcoeff_rcf); + + if (D - d == 1) { + unprocessed_idx += 1; + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], + hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], + queue_idx); + + } else { // D - d >= 2 + unprocessed_idx += 2; + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + 
SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], + hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], + queue_idx); + } + } + // Done interpolation-restore on doutput + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D("After interpolation reverse-reorder", doutput); + } // debug + + unprocessed_idx = 0; + + // printf("reorder-restore 1-3D\n"); + curr_dims[0] = 0; + curr_dims[1] = 1; + curr_dims[2] = 2; + dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); + dinput1.project(curr_dims[0], curr_dims[1], + curr_dims[2]); // reuse input1 as temp space + + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput2, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, + 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], + hierarchy.dofs[curr_dims[0]][l], queue_idx); + + DIM D_reduced = D % 2 == 0 ? 
D - 1 : D - 2; + for (DIM d = 3; d < D_reduced; d += 2) { + // printf("reorder-reverse\n"); + // copy back to input2 for reordering again + // LwpkReo().Execute(dinput1, dinput2, queue_idx); + CopyND(dinput1, dinput2, queue_idx); + + // printf("reorder-restore %u-%uD\n", d+1, d+2); + curr_dims[0] = 0; + curr_dims[1] = d; + curr_dims[2] = d + 1; + dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); + dinput1.project(curr_dims[0], curr_dims[1], + curr_dims[2]); // reuse input1 as temp output + + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput2, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, + dcoeff_rc, dcoeff_rcf); + + unprocessed_idx += 2; + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], + hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], + queue_idx); + } + + // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+2); + curr_dims[0] = 0; + curr_dims[1] = D_reduced; + curr_dims[2] = D_reduced + 1; + dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); + doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); + CalcCoefficientsPointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, + dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf); + + if (D - D_reduced == 1) { + // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+1); + unprocessed_idx += 1; + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], + hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], + queue_idx); + } else { // D - D_reduced >= 2 + // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+2); + unprocessed_idx += 2; + GpkRev().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.unprocessed_n[unprocessed_idx], + SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], + curr_dims[1], curr_dims[0], + SubArray(hierarchy.ratio_array[curr_dims[2]][l]), + SubArray(hierarchy.ratio_array[curr_dims[1]][l]), + SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, + dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, + dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], + hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], + queue_idx); + } + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D("After coeff restore", doutput); + } // debug +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git 
a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.h b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.h deleted file mode 100644 index c60b73de2c..0000000000 --- a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.h +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_GRID_PROCESSING_KERNEL -#define MGARD_X_GRID_PROCESSING_KERNEL - -#include "../../Common.h" - -namespace mgard_x { - -template -void gpk_reo(Handle &handle, SIZE *shape_h, SIZE *shape_d, - SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM unprocessed_n, - DIM *unprocessed_dims, DIM curr_dim_r, DIM curr_dim_c, - DIM curr_dim_f, T *dratio_r, T *dratio_c, T *dratio_f, T *dv, - LENGTH lddv1, LENGTH lddv2, T *dw, LENGTH lddw1, LENGTH lddw2, - T *dwf, LENGTH lddwf1, LENGTH lddwf2, T *dwc, LENGTH lddwc1, - LENGTH lddwc2, T *dwr, LENGTH lddwr1, LENGTH lddwr2, T *dwcf, - LENGTH lddwcf1, LENGTH lddwcf2, T *dwrf, LENGTH lddwrf1, - LENGTH lddwrf2, T *dwrc, LENGTH lddwrc1, LENGTH lddwrc2, T *dwrcf, - LENGTH lddwrcf1, LENGTH lddwrcf2, int queue_idx, int config); - -template -void gpk_rev(Handle &handle, SIZE *shape_h, SIZE *shape_d, - SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM unprocessed_n, - DIM *unprocessed_dims, DIM curr_dim_r, DIM curr_dim_c, - DIM curr_dim_f, T *dratio_r, T *dratio_c, T *dratio_f, T *dv, - LENGTH lddv1, LENGTH lddv2, T *dw, LENGTH lddw1, LENGTH lddw2, - T *dwf, LENGTH lddwf1, LENGTH lddwf2, T *dwc, LENGTH lddwc1, - LENGTH lddwc2, T *dwr, LENGTH lddwr1, LENGTH lddwr2, T *dwcf, - LENGTH lddwcf1, LENGTH lddwcf2, T *dwrf, LENGTH lddwrf1, - LENGTH lddwrf2, T *dwrc, LENGTH lddwrc1, LENGTH lddwrc2, T *dwrcf, - LENGTH lddwrcf1, LENGTH lddwrcf2, SIZE svr, SIZE svc, SIZE svf, - SIZE nvr, SIZE nvc, SIZE nvf, int queue_idx, int config); - -template -class GpkReo; - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.hpp index f022bcdf77..f56c5188c2 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel.hpp @@ -2267,15 +2267,12 @@ class GpkReo : public AutoTuner { SubArray wrc, SubArray wrcf, int queue_idx) { int range_l = std::min(6, (int)std::log2(shape.dataHost()[curr_dim_f]) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_cc[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.gpk_reo_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define GPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -2291,22 +2288,26 @@ class GpkReo : public AutoTuner { curr_dim_c, curr_dim_f, ratio_r, ratio_c, ratio_f, v, w, wf, wc, wr, \ wcf, wrf, wrc, wrcf, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; 
\ } \ } \ } - GPK(0) - GPK(1) - GPK(2) - GPK(3) - GPK(4) - GPK(5) - GPK(6) + GPK(6) if (!ret.success) config--; + GPK(5) if (!ret.success) config--; + GPK(4) if (!ret.success) config--; + GPK(3) if (!ret.success) config--; + GPK(2) if (!ret.success) config--; + GPK(1) if (!ret.success) config--; + GPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for GpkReo.\n"; + exit(-1); + } #undef GPK if (AutoTuner::ProfileKernels) { @@ -4686,15 +4687,12 @@ class GpkRev : public AutoTuner { SubArray wrcf, SIZE svr, SIZE svc, SIZE svf, SIZE nvr, SIZE nvc, SIZE nvf, int queue_idx) { int range_l = std::min(6, (int)std::log2(shape.dataHost()[curr_dim_f]) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_cc[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.gpk_rev_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define GPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -4710,22 +4708,26 @@ class GpkRev : public AutoTuner { curr_dim_c, curr_dim_f, ratio_r, ratio_c, ratio_f, v, w, wf, wc, wr, \ wcf, wrf, wrc, wrcf, svr, svc, svf, nvr, nvc, nvf, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - GPK(0) - GPK(1) - GPK(2) - GPK(3) - GPK(4) - GPK(5) - GPK(6) + GPK(6) if (!ret.success) config--; + GPK(5) if (!ret.success) config--; + GPK(4) if (!ret.success) config--; + GPK(3) if (!ret.success) config--; + GPK(2) if (!ret.success) config--; + GPK(1) if (!ret.success) config--; + GPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for GpkRev.\n"; + exit(-1); + } #undef GPK if (AutoTuner::ProfileKernels) { @@ -4734,4512 +4736,6 @@ class GpkRev : public AutoTuner { } }; -// template -// __global__ void -// _gpk_reo(SIZE *shape, SIZE *shape_c, SIZE *ldvs, SIZE *ldws, DIM -// unprocessed_n, -// DIM *unprocessed_dims, DIM curr_dim_r, DIM curr_dim_c, DIM -// curr_dim_f, T *dratio_r, T *dratio_c, T *dratio_f, T *dv, LENGTH -// lddv1, LENGTH lddv2, T *dw, LENGTH lddw1, LENGTH lddw2, T *dwf, -// LENGTH lddwf1, LENGTH lddwf2, T *dwc, LENGTH lddwc1, LENGTH lddwc2, -// T *dwr, LENGTH lddwr1, LENGTH lddwr2, T *dwcf, LENGTH lddwcf1, -// LENGTH lddwcf2, T *dwrf, LENGTH lddwrf1, LENGTH lddwrf2, T *dwrc, -// LENGTH lddwrc1, LENGTH lddwrc2, T *dwrcf, LENGTH lddwrcf1, LENGTH -// lddwrcf2) { - -// // bool debug = false; -// // if (FunctorBase::GetBlockIdX() == 0 && -// FunctorBase::GetBlockIdY() ==0 && -// FunctorBase::GetBlockIdZ() == 0 && -// // threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) debug = -// // false; - -// // volatile clock_t start = 0; -// // volatile clock_t end = 0; -// // volatile unsigned long long sum_time = 0; - -// LENGTH threadId = (threadIdx.z * (FunctorBase::GetBlockDimX() * -// FunctorBase::GetBlockDimY())) + -// (threadIdx.y * FunctorBase::GetBlockDimX()) + -// threadIdx.x; - -// SIZE nr, nc, nf; -// SIZE nr_c, nc_c, nf_c; -// SIZE r, c, f; -// SIZE rest_r, rest_c, rest_f; -// SIZE nr_p, nc_p, nf_p; -// SIZE rest_r_p, rest_c_p, rest_f_p; -// SIZE r_sm, c_sm, f_sm; -// SIZE r_sm_ex, c_sm_ex, f_sm_ex; 
-// SIZE r_gl, c_gl, f_gl; -// SIZE r_gl_ex, c_gl_ex, f_gl_ex; -// T res; -// bool in_next = true; - -// T *sm = SharedMemory(); -// SIZE ldsm1 = (F/2) * 2 + 1; -// SIZE ldsm2 = (C/2) * 2 + 1; - -// T *v_sm = sm; sm += ((F/2) * 2 + 1) * ((C/2) * 2 + 1) * ((R/2) * 2 + 1); -// T *ratio_f_sm = sm; sm += (F/2) * 2; -// T *ratio_c_sm = sm; sm += (C/2) * 2; -// T *ratio_r_sm = sm; sm += (R/2) * 2; - -// SIZE * sm_size = (SIZE*)sm; -// SIZE *shape_sm = sm_size; sm_size += D_GLOBAL; -// SIZE *shape_c_sm = sm_size; sm_size += D_GLOBAL; -// SIZE *ldvs_sm = sm_size; sm_size += D_GLOBAL; -// SIZE *ldws_sm = sm_size; sm_size += D_GLOBAL; -// sm = (T*)sm_size; - -// DIM * sm_dim = (DIM*)sm; -// DIM *unprocessed_dims_sm = sm_dim; sm_dim += D_GLOBAL; -// sm = (T*)sm_dim; - -// SIZE idx[D_GLOBAL]; -// if (threadId < D_GLOBAL) { -// shape_sm[threadId] = shape[threadId]; -// shape_c_sm[threadId] = shape_c[threadId]; -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } - -// if (threadId < unprocessed_n) { -// unprocessed_dims_sm[threadId] = unprocessed_dims[threadId]; -// } -// __syncthreads(); - -// for (DIM d = 0; d < D_GLOBAL; d++) -// idx[d] = 0; - -// nr = shape_sm[curr_dim_r]; -// nc = shape_sm[curr_dim_c]; -// nf = shape_sm[curr_dim_f]; - -// nr_c = shape_c_sm[curr_dim_r]; -// nc_c = shape_c_sm[curr_dim_c]; -// nf_c = shape_c_sm[curr_dim_f]; - -// if (D_LOCAL < 3) { -// nr = 1; -// nr_c = 1; -// } -// if (D_LOCAL < 2) { -// nc = 1; -// nc_c = 1; -// } - -// r = FunctorBase::GetBlockIdZ() * -// FunctorBase::GetBlockDimZ(); c = -// FunctorBase::GetBlockIdY() * -// FunctorBase::GetBlockDimY(); SIZE bidx = -// FunctorBase::GetBlockIdX(); SIZE firstD = -// div_roundup(shape_sm[0] - 1, FunctorBase::GetBlockDimX()); f = -// (bidx % firstD) * FunctorBase::GetBlockDimX(); - -// bidx /= firstD; - -// // if (debug) printf("n: %d %d %d rcf: %d %d %d\n", nr, nc, nf, r, c, f); -// rest_r = nr - r; -// rest_c = nc - c; -// rest_f = nf - f; - -// nr_p = nr; -// nc_p = nc; -// nf_p = nf; - -// rest_r_p = rest_r; -// rest_c_p = rest_c; -// rest_f_p = rest_f; - -// if (nr % 2 == 0) { -// nr_p = nr + 1; -// rest_r_p = nr_p - r; -// } -// if (nc % 2 == 0) { -// nc_p = nc + 1; -// rest_c_p = nc_p - c; -// } -// if (nf % 2 == 0) { -// nf_p = nf + 1; -// rest_f_p = nf_p - f; -// } - -// for (DIM d = 0; d < D_GLOBAL; d++) { -// if (D_LOCAL == 3 && d != curr_dim_r && d != curr_dim_c && d != -// curr_dim_f) { -// idx[d] = bidx % shape_sm[d]; -// bidx /= shape_sm[d]; -// if (idx[d] >= shape_c_sm[d]) -// in_next = false; -// } -// if (D_LOCAL == 2 && d != curr_dim_c && d != curr_dim_f) { -// idx[d] = bidx % shape_sm[d]; -// bidx /= shape_sm[d]; -// if (idx[d] >= shape_c_sm[d]) -// in_next = false; -// } -// } - -// int skip = 0; -// #pragma unroll 1 -// for (DIM t = 0; t < D_GLOBAL; t++) { -// for (DIM k = 0; k < unprocessed_n; k++) { -// if (t == unprocessed_dims_sm[k] && -// (shape_sm[t] % 2 == 1 && idx[t] % 2 == 1 || -// shape_sm[t] % 2 == 0 && idx[t] % 2 == 1 && -// idx[t] != shape_sm[t] - 1)) { -// skip = 1; -// } -// } -// } - -// // if (FunctorBase::GetBlockIdX() == 0 && -// FunctorBase::GetBlockIdY() == 0 && -// FunctorBase::GetBlockIdZ() == 0) { -// // if (threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { -// // printf("total_idx_sm: %d %d %d %d (skip: %d)\n", idx[3], idx[2], -// idx[1], -// // idx[0], skip); -// // } -// // } - -// LENGTH other_offset_v = get_idx(ldvs_sm, idx); -// LENGTH other_offset_w = get_idx(ldws_sm, idx); - -// dv = dv + other_offset_v; -// dw = dw + 
other_offset_w; -// dwr = dwr + other_offset_w; -// dwc = dwc + other_offset_w; -// dwf = dwf + other_offset_w; -// dwrf = dwrf + other_offset_w; -// dwrc = dwrc + other_offset_w; -// dwcf = dwcf + other_offset_w; -// dwrcf = dwrcf + other_offset_w; - -// if (TYPE == 2) { -// dwf = dw; -// dwcf = dwc; -// dwrf = dwr; -// dwrcf = dwrc; -// } -// __syncthreads(); -// // if (!skip) -// { -// r_sm = threadIdx.z; -// c_sm = threadIdx.y; -// f_sm = threadIdx.x; - -// r_sm_ex = (R/2) * 2; -// c_sm_ex = (C/2) * 2; -// f_sm_ex = (F/2) * 2; - -// r_gl = r + r_sm; -// r_gl_ex = r + (R/2) * 2; -// c_gl = c + c_sm; -// c_gl_ex = c + (C/2) * 2; -// f_gl = f + f_sm; -// f_gl_ex = f + (F/2) * 2; - -// // __syncthreads(); -// // if (r_sm == 0 && c_sm == 0 && f_sm == 0) { -// // //printf("setting zeros\n"); -// // for (int i = 0; i < (R/2) * 2 + 1; i++) { -// // for (int j = 0; j < (C/2) * 2 + 1; j++) { -// // for (int k = 0; k < (F/2) * 2 + 1; k++) { -// // v_sm[get_idx(ldsm1, ldsm2, i, j, k)] = 0.0; -// // } -// // } -// // } -// // //printf("done zeros\n"); -// // } -// // __syncthreads(); -// /* Load v */ -// // loading extra rules -// // case 1: input = odd (non-padding required) -// // case 1.a: block size < rest (need to load extra); -// // case 1.b: block size > rest (NO need to load extra); -// // case 2: input = even (padding requried) -// // case 2.a: block size < rest (need to load extra); -// // case 2.b: block size >= rest (NO need to load extra, but need -// // padding); - -// // Load from dv -// if (r_sm < rest_r && c_sm < rest_c && f_sm < rest_f) { - -// // load cubic -// // asm volatile("membar.cta;"); -// // start = clock64(); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)]; -// // if (FunctorBase::GetBlockIdX()==0 && -// FunctorBase::GetBlockIdY()==0&&FunctorBase::GetBlockIdZ()==0) -// { -// // printf("load (%d %d %d) %f <- %d+(%d %d %d) (ld: %d %d)\n", -// // r_sm, c_sm, f_sm, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)], -// // other_offset_v+r_gl, c_gl, f_gl, lddv1, lddv2); -// // } -// if (r_sm == 0) { -// if (rest_r > (R/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm)] = -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)]; -// } -// } -// if (c_sm == 0) { -// if (rest_c > (C/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm)] = -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)]; -// } -// } -// if (f_sm == 0) { -// if (rest_f > (F/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm_ex)] = -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)]; -// } -// } -// if (c_sm == 0 && f_sm == 0) { -// if (rest_c > (C/2) * 2 && rest_f > (F/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm_ex)] = -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)]; -// } -// } -// if (r_sm == 0 && f_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_f > (F/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm_ex)] = -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)]; -// } -// } -// if (r_sm == 0 && c_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_c > (C/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm)] = -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)]; -// } -// } -// if (r_sm == 0 && c_sm == 0 && f_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_c > (C/2) * 2 && rest_f > (F/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm_ex)] = -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)]; -// } -// } -// } - -// __syncthreads(); - -// // apply padding is necessary -// if (r_sm 
< rest_r && c_sm < rest_c && f_sm < rest_f) { - -// // printf("load main[%d %d %d]:%f --> [%d %d %d] (%d %d %d)\n", r_gl, -// // c_gl, f_gl, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)], r_sm, c_sm, f_sm, -// nr, -// // nc, nf); - -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[load main] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// // FunctorBase::GetBlockIdY(), -// FunctorBase::GetBlockIdX(), start); start = clock64(); - -// // load extra surface - -// if (r_sm == 0) { -// if (rest_r > (R/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm)] = -// // dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)]; -// // printf("load-r[%d %d %d]:%f --> [%d %d %d]\n", r_gl_ex, c_gl, -// f_gl, -// // dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)], r_sm_ex, c_sm, -// // f_sm); -// } else if (nr % 2 == 0) { -// // if (r == 16 && c == 0 && f == 0) { -// // printf("padding (%d %d %d) %f <- (%f %f %f)\n", rest_r_p - 1, -// // c_sm, f_sm, -// // v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm)], -// rest_r -// // - 1, c_sm, f_sm); -// // padded = true; -// // aa = v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm)]; -// // bb = v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm)]; -// // } -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm)]; -// } -// } - -// if (c_sm == 0) { -// if (rest_c > (C/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm)] = -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)]; -// // printf("load-c[%d %d %d]:%f --> [%d %d %d]\n", r_gl, c_gl_ex, -// f_gl, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)], r_sm, c_sm_ex, -// // f_sm); -// } else if (nc % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, f_sm)]; -// } -// } - -// if (f_sm == 0) { -// if (rest_f > (F/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm_ex)] = -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)]; -// // printf("load-f[%d %d %d]:%f --> [%d %d %d]\n", r_gl, c_gl, -// f_gl_ex, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)], r_sm, c_sm, -// // f_sm_ex); -// } else if (nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f - 1)]; -// } -// } - -// // load extra edges -// if (c_sm == 0 && f_sm == 0) { -// if (rest_c > (C/2) * 2 && rest_f > (F/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm_ex)] = -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)]; -// // printf("load-cf[%d %d %d]:%f --> [%d %d %d]\n", r_gl, c_gl_ex, -// // f_gl_ex, dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)], -// r_sm, -// // c_sm_ex, f_sm_ex); -// } else if (rest_c <= (C/2) * 2 && rest_f <= (F/2) * 2 && nc % 2 == 0 -// && -// nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, rest_f - 1)]; -// } else if (rest_c > (C/2) * 2 && rest_f <= (F/2) * 2 && nf % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, rest_f - 1)]; -// } else if (rest_c <= (C/2) * 2 && rest_f > (F/2) * 2 && nc % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, f_sm_ex)]; -// } -// } - -// if (r_sm == 0 && f_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_f > (F/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, 
r_sm_ex, c_sm, f_sm_ex)] = -// // dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)]; -// // printf("load-rf[%d %d %d]:%f --> [%d %d %d]\n", r_gl_ex, c_gl, -// // f_gl_ex, dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)], -// // r_sm_ex, c_sm, f_sm_ex); -// } else if (rest_r <= (R/2) * 2 && rest_f <= (F/2) * 2 && nr % 2 == 0 -// && -// nf % 2 == 0) { -// // printf("padding (%d %d %d) <- (%d %d %d)\n", rest_r_p - 1, c_sm, -// // rest_f_p - 1, rest_r - 1, c_sm, rest_f - 1); -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, rest_f - 1)]; -// } else if (rest_r > (R/2) * 2 && rest_f <= (F/2) * 2 && nf % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, rest_f - 1)]; -// } else if (rest_r <= (R/2) * 2 && rest_f > (F/2) * 2 && nr % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm_ex)]; -// } -// } - -// if (r_sm == 0 && c_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_c > (C/2) * 2) { -// // v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm)] = -// // dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)]; -// // printf("load-rc[%d %d %d]:%f --> [%d %d %d]\n", r_gl_ex, -// c_gl_ex, -// // f_gl, dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)], -// r_sm_ex, -// // c_sm_ex, f_sm); -// } else if (rest_r <= (R/2) * 2 && rest_c <= (C/2) * 2 && nr % 2 == 0 -// && -// nc % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, f_sm)]; -// // printf("padding (%d %d %d) <- (%d %d %d): %f\n", rest_r_p - 1, -// // rest_c_p - 1, f_sm, rest_r - 1, rest_c - 1, f_sm, -// // v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, f_sm)]); -// } else if (rest_r > (R/2) * 2 && rest_c <= (C/2) * 2 && nc % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c - 1, f_sm)]; -// } else if (rest_r <= (R/2) * 2 && rest_c > (C/2) * 2 && nr % 2 == 0) -// { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm_ex, f_sm)]; -// } -// } -// // load extra vertex - -// if (r_sm == 0 && c_sm == 0 && f_sm == 0) { -// if (rest_r > (R/2) * 2 && rest_c > (C/2) * 2 && rest_f > (F/2) * 2) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm_ex)] = -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)]; -// // printf("load-rcf[%d %d %d]:%f --> [%d %d %d]\n", r_gl_ex, -// c_gl_ex, -// // f_gl_ex, dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)], -// // r_sm_ex, c_sm_ex, f_sm_ex); -// } else if (rest_r <= (R/2) * 2 && rest_c <= (C/2) * 2 && rest_f <= -// (F/2) * 2 && -// nr % 2 == 0 && nc % 2 == 0 && nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, -// rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, rest_f - -// 1)]; -// } else if (rest_r > (R/2) * 2 && rest_c > (C/2) * 2 && rest_f <= -// (F/2) * 2 && -// nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, rest_f - 1)]; -// } else if (rest_r > (R/2) * 2 && rest_c <= (C/2) * 2 && rest_f > -// (F/2) * 2 && -// nc % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c - 1, f_sm_ex)]; -// } else if (rest_r > (R/2) * 2 && rest_c <= (C/2) * 2 && rest_f <= -// (F/2) * 2 && -// nc % 2 
== 0 && nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c - 1, rest_f - 1)]; -// } else if (rest_r <= (R/2) * 2 && rest_c > (C/2) * 2 && rest_f > -// (F/2) * 2 && -// nr % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm_ex, f_sm_ex)]; -// } else if (rest_r <= (R/2) * 2 && rest_c > (C/2) * 2 && rest_f <= -// (F/2) * 2 && -// nr % 2 == 0 && nf % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, rest_f_p - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm_ex, rest_f - 1)]; -// } else if (rest_r <= (R/2) * 2 && rest_c <= (C/2) * 2 && rest_f > -// (F/2) * 2 && -// nr % 2 == 0 && nc % 2 == 0) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, f_sm_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, f_sm_ex)]; -// } -// } - -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[load extra] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// // FunctorBase::GetBlockIdY(), -// FunctorBase::GetBlockIdX(), start); start = clock64(); - -// // load dist -// if (c_sm == 0 && f_sm == 0 && r_sm < rest_r_p - 2) { -// // printf("%d/%d load %f\n", r_sm, rest_r - 2, dratio_r[r + r_sm]); -// ratio_r_sm[r_sm] = dratio_r[r + r_sm]; -// // if (nr % 2 == 0 && (R/2) * 2 + 1 >= rest_r_p && r_sm == 0) { -// // ratio_r_sm[rest_r_p - 3] = 0.5; -// // } -// } -// if (r_sm == 0 && f_sm == 0 && c_sm < rest_c_p - 2) { -// ratio_c_sm[c_sm] = dratio_c[c + c_sm]; -// // if (nc % 2 == 0 && (C/2) * 2 + 1 >= rest_c_p && c_sm == 0) { -// // ratio_c_sm[rest_c_p - 3] = 0.5; -// // } -// } -// if (c_sm == 0 && r_sm == 0 && f_sm < rest_f_p - 2) { -// ratio_f_sm[f_sm] = dratio_f[f + f_sm]; -// // if (nf % 2 == 0 && (F/2) * 2 + 1 >= rest_f_p && f_sm == 0) { -// // ratio_f_sm[rest_f_p - 3] = 0.5; -// // } -// } - -// // if (r == 0 && c == 0 && f == 0 && r_sm == 0 && c_sm == 0 && f_sm == -// 0) -// // { -// // printf("ratio:"); -// // for (int i = 0; i < (R/2) * 2 + 1; i++) { -// // printf("%2.2f ", ratio_r_sm[i]); -// // } -// // printf("\n"); -// // } - -// } // restrict boundary - -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[load ratio] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// // FunctorBase::GetBlockIdY(), -// FunctorBase::GetBlockIdX(), start); start = clock64(); - -// // __syncthreads(); -// // // debug print -// // if (debug) { -// // printf("in config: %d %d %d (%d %d %d)\n", (R/2), (C/2), (F/2), -// r,c,f); -// // printf("rest_p: %d %d %d\n", rest_r_p, rest_c_p, rest_f_p); -// // bool print = false; -// // for (int i = 0; i < (R/2) * 2 + 1; i++) { -// // for (int j = 0; j < (C/2) * 2 + 1; j++) { -// // for (int k = 0; k < (F/2) * 2 + 1; k++) { -// // // if (abs(v_sm[get_idx(ldsm1, ldsm2, i, j, k)]) > 10000) { -// // // print = true; -// // // printf("(block %d %d %d) %2.2f \n", r,c,f, -// // v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // // printf("%2.2f ", v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // // } -// // printf("%2.2f ", v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // } -// // } -// __syncthreads(); - -// if (dw && threadId < (R/2) * (C/2) * (F/2)) { -// r_sm = (threadId / ((C/2) * (F/2))) * 2; -// c_sm = ((threadId % ((C/2) * (F/2))) / (F/2)) * 2; -// f_sm = ((threadId % ((C/2) * (F/2))) % (F/2)) * 2; -// r_gl = r / 2 + threadId / ((C/2) * (F/2)); -// 
c_gl = c / 2 + threadId % ((C/2) * (F/2)) / (F/2); -// f_gl = f / 2 + threadId % ((C/2) * (F/2)) % (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } - -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[store coarse] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// // FunctorBase::GetBlockIdY(), -// FunctorBase::GetBlockIdX(), start); start = clock64(); int -// base = 0; -// // printf("TYPE =%d \n", TYPE); -// // printf("%d == %d && %llu >= %d && %llu < %d\n", r + (R/2) * 2, nr_p - -// 1, -// // threadId, base, threadId, base + (C/2) * (F/2)); - -// if (dw && r + (R/2) * 2 == nr_p - 1 && threadId >= base && -// threadId < base + (C/2) * (F/2)) { -// r_sm = (R/2) * 2; -// c_sm = ((threadId - base) / (F/2)) * 2; -// f_sm = ((threadId - base) % (F/2)) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + (threadId - base) / (F/2); -// f_gl = f / 2 + (threadId - base) % (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } - -// base += (C/2) * (F/2); // ROUND_UP_WARP((C/2) * (F/2)) * WARP_SIZE; -// if (dw && c + (C/2) * 2 == nc_p - 1 && threadId >= base && -// threadId < base + (R/2) * (F/2)) { -// r_sm = ((threadId - base) / (F/2)) * 2; -// c_sm = (C/2) * 2; -// f_sm = ((threadId - base) % (F/2)) * 2; -// r_gl = r / 2 + (threadId - base) / (F/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (threadId - base) % (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < 
nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// // printf("(%d %d %d) (%d %d %d) %f\n", -// // r_sm, c_sm, f_sm, r_gl, c_gl, f_gl, dwork[get_idx(lddv1, -// lddv2, -// // r_gl, c_gl, f_gl)]); -// } - -// base += (R/2) * (F/2); // ROUND_UP_WARP((R/2) * (F/2)) * WARP_SIZE; -// // printf("%d %d\n", base, threadId); -// if (dw && f + (F/2) * 2 == nf_p - 1 && threadId >= base && -// threadId < base + (R/2) * (C/2)) { -// r_sm = ((threadId - base) / (C/2)) * 2; -// c_sm = ((threadId - base) % (C/2)) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (threadId - base) / (C/2); -// c_gl = c / 2 + (threadId - base) % (C/2); -// f_gl = f / 2 + (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } - -// base += (R/2) * (C/2); // ROUND_UP_WARP((R/2) * (C/2)) * WARP_SIZE; -// // load extra edges -// if (dw && c + (C/2) * 2 == nc_p - 1 && f + (F/2) * 2 == nf_p - 1 && -// threadId >= base && threadId < base + (R/2)) { -// r_sm = (threadId - base) * 2; -// c_sm = (C/2) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + threadId - base; -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) 
{ -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } - -// base += (R/2); // ROUND_UP_WARP((R/2)) * WARP_SIZE; -// // if (TYPE == 2) printf("%d %d, %d, %llu, %d\n",dw == NULL, f + (F/2) * -// 2, nf_p -// // - 1, threadId, (C/2)); -// if (dw && r + (R/2) * 2 == nr_p - 1 && f + (F/2) * 2 == nf_p - 1 && -// threadId >= base && threadId < base + (C/2)) { -// r_sm = (R/2) * 2; -// c_sm = (threadId - base) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + threadId - base; -// f_gl = f / 2 + (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// // printf("store[%d %d %d]: %f\n", r_sm, c_sm, f_sm, -// v_sm[get_idx(ldsm1, -// // ldsm2, r_sm, c_sm, f_sm)]); -// } - -// base += (C/2); // ROUND_UP_WARP((C/2)) * WARP_SIZE; -// if (dw && r + (R/2) * 2 == nr_p - 1 && c + (C/2) * 2 == nc_p - 1 && -// threadId >= base && threadId < base + (F/2)) { -// r_sm = (R/2) * 2; -// c_sm = (C/2) * 2; -// f_sm = (threadId - base) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + threadId - base; -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } -// base += (F/2); // ROUND_UP_WARP((F/2)) * WARP_SIZE; -// // // load extra vertex -// if (dw && r + (R/2) * 2 == nr_p - 1 && c + (C/2) * 2 == nc_p - 1 && -// f + (F/2) * 2 == nf_p - 1 && threadId >= base && 
threadId < base + 1) -// { -// r_sm = (R/2) * 2; -// c_sm = (C/2) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (F/2); -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = res; -// // printf("w-store: %d+(%d %d %d) <- %f (%d %d %d)\n", -// other_offset_w, -// // r_gl, c_gl, f_gl, dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)], -// // r_sm, c_sm, f_sm); -// } -// } -// } - -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[store extra] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// // FunctorBase::GetBlockIdY(), -// FunctorBase::GetBlockIdX(), start); start = clock64(); - -// // start = clock64(); - -// if (dwf && threadId >= (R/2) * (C/2) * (F/2) && threadId < (R/2) * (C/2) -// * (F/2) * 2) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2)) / ((C/2) * (F/2))) * 2; -// c_sm = (((threadId - (R/2) * (C/2) * (F/2)) % ((C/2) * (F/2))) / (F/2)) -// * 2; f_sm = (((threadId - (R/2) * (C/2) * (F/2)) % ((C/2) * (F/2))) % -// (F/2)) * 2 + 1; r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2)) / -// ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * (F/2)) % -// ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) * (C/2) * -// (F/2)) % ((C/2) * (F/2))) % (F/2); res = v_sm[get_idx(ldsm1, ldsm2, -// r_sm, c_sm, f_sm)]; if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } -// if (INTERPOLATION && CALC_COEFF) { // fused -// res = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] - res; -// } -// if (!INTERPOLATION && CALC_COEFF) { // calc_coeff only -// res -= dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// } -// } -// dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// if (CALC_COEFF) { -// if (in_next && f_gl < nf_c) { -// ; -// } else { -// res -= dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)]; -// } -// } -// } -// dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)] = res; -// } -// } - -// // if (nr == 70) printf("f-store: (%d %d %d) <- %f (%d %d %d)\n", r_gl, -// // c_gl, f_gl, v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)], r_sm, -// c_sm, -// // f_sm); -// // asm volatile("membar.cta;"); -// // start = clock64() - start; -// // printf("[(F/2)-store] block id %d,%d,%d elapsed %lu\n", -// FunctorBase::GetBlockIdZ(), -// 
[Deleted by this hunk: the remainder of the commented-out `_gpk_reo` kernel body. One thread partition of size (R/2) * (C/2) * (F/2) per coefficient plane (dwf, dwc, dwr, dwcf, dwrf, dwrc, dwrcf) decomposed its flat threadId into shared-memory indices (r_sm, c_sm, f_sm) and global indices (r_gl, c_gl, f_gl), computed `res` from nested lerp() interpolations under the INTERPOLATION / CALC_COEFF template flags (the TYPE == 2 branches remap f_gl and mostly skip the in_next test), and stored `res` back through get_idx(). The same stores were then repeated for the three boundary planes (r + (R/2)*2 == nr_p - 1, c + (C/2)*2 == nc_p - 1, f + (F/2)*2 == nf_p - 1) and the three boundary edges, followed by a commented-out dump of the shared-memory tile that closed the kernel; interleaved clock64()/membar.cta timing printfs were deleted along with it. The commented-out `gpk_reo_adaptive_launcher<D_GLOBAL, D_LOCAL, T, R, C, F>` signature begins at the end of this span.]
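For readers skimming this hunk: every deleted store built its coefficient value by composing 1-D lerp() calls per axis. Below is a minimal standalone sketch of that composition, assuming the conventional two-point form of lerp (the deleted code only shows lerp's call sites, and the values here are illustrative, not MGARD's):

#include <cstdio>

// Assumed two-point form; only lerp's call sites appear in the deleted code.
template <typename T> T lerp(T v0, T v1, T ratio) {
  return v0 + ratio * (v1 - v0);
}

int main() {
  // Corner values of one 2-D cell and the per-axis ratios, standing in for
  // the kernel's v_sm reads and ratio_c_sm / ratio_r_sm entries.
  double v00 = 1.0, v01 = 3.0, v10 = 2.0, v11 = 6.0;
  double ratio_c = 0.5, ratio_r = 0.25;
  // Two-stage composition, as in the deleted dwrc section: interpolate
  // along c on each r-row, then along r between the two results.
  double c1 = lerp(v00, v01, ratio_c);  // = 2.0
  double c2 = lerp(v10, v11, ratio_c);  // = 4.0
  double res = lerp(c1, c2, ratio_r);   // = 2.5
  std::printf("interpolated value: %f\n", res);
  return 0;
}

The deleted dwcf/dwrf sections add one more lerp stage along f, and the dwrcf section composes all three axes (f1..f4, then fc1/fc2, then the final lerp).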
[Deleted by this hunk: the commented-out `gpk_reo_adaptive_launcher` body. It read nr/nc/nf from shape_h (forcing nr = 1 when D_LOCAL == 2), took max(n - 1, 1) threads per dimension, used (F, C, R) as the thread-block shape, sized dynamic shared memory as ((R+1) * (C+1) * (F+1) + R + C + F) * sizeof(T) plus small shape/dim metadata arrays, folded every dimension other than curr_dim_r/c/f into gridx, launched _gpk_reo on the given queue, and finished with gpuErrchk(cudaGetLastError()) and an optional cudaDeviceSynchronize(). The commented-out `gpk_reo` host dispatcher and its GPK(R, C, F) macro, which forwards the full argument list to the launcher, begin at the end of this span.]
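The launcher's arithmetic is easy to restate on the host. A sketch under stated assumptions: the shape values are made up, T is taken as double, and the small shape/dim metadata the original added to sm_size is omitted.

#include <algorithm>
#include <cstdio>

int main() {
  using SIZE = unsigned int;
  SIZE nr = 65, nc = 65, nf = 129;     // hypothetical input extents
  const SIZE R = 4, C = 4, F = 16;     // one of the GPK block shapes

  // One thread per interior point, at least one per dimension.
  SIZE tz = std::max(nr - 1, SIZE(1));
  SIZE ty = std::max(nc - 1, SIZE(1));
  SIZE tx = std::max(nf - 1, SIZE(1));

  // Grid = ceil(total / block) per dimension, as in the deleted launcher.
  SIZE gridz = (tz + R - 1) / R;
  SIZE gridy = (ty + C - 1) / C;
  SIZE gridx = (tx + F - 1) / F;

  // Padded (R+1) x (C+1) x (F+1) tile plus one ratio array per axis.
  size_t sm_size = ((R + 1) * (C + 1) * (F + 1) + R + C + F) * sizeof(double);

  std::printf("grid (%u %u %u), block (%u %u %u), smem %zu B\n",
              gridx, gridy, gridz, F, C, R, sm_size);
  return 0;
}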
[Deleted by this hunk: the rest of the commented-out `gpk_reo` dispatcher. With `profile` set from handle.profile_kernels, it selected a block shape per `config`: (2,2,128), (2,2,64), (4,4,32), (4,4,16), (4,4,8), (4,4,4), (2,2,2) for config 6..0 when D_LOCAL == 3, the same F values with (1,2,...) / (1,4,...) shapes when D_LOCAL == 2, and (1,1,...) shapes when D_LOCAL == 1, before #undef-ing GPK. The commented-out reverse kernel `_gpk_rev` begins next: its signature mirrors _gpk_reo with COEFF_RESTORE in place of CALC_COEFF plus svr/svc/svf and nvr/nvc/nvf extents, and its preamble computed the flat threadId and carved dynamic shared memory into the padded v_sm tile (ldsm1 = (F/2)*2 + 1, ldsm2 = (C/2)*2 + 1), the ratio_f/c/r arrays, and the shape metadata arrays.]
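The dispatch pattern is worth spelling out, since it is the reason for the GPK macro: every (R, C, F) shape must exist as a template instantiation at compile time, and profile mode instantiates them all. A self-contained sketch with a hypothetical stand-in launcher (the real one takes the full argument list repeated in the macro above):

#include <cstdio>

template <int R, int C, int F>
void launch(int n) { // stand-in for gpk_reo_adaptive_launcher
  std::printf("launching <%d, %d, %d> for n = %d\n", R, C, F, n);
}

void dispatch(int config, int n, bool profile) {
// Mirrors the deleted D_LOCAL == 3 table; the macro keeps the repeated
// argument list in one place while forcing each instantiation.
#define GPK(R, C, F) launch<R, C, F>(n);
  if (profile || config == 6) { GPK(2, 2, 128) }
  if (profile || config == 5) { GPK(2, 2, 64) }
  if (profile || config == 4) { GPK(4, 4, 32) }
  if (profile || config == 3) { GPK(4, 4, 16) }
  if (profile || config == 2) { GPK(4, 4, 8) }
  if (profile || config == 1) { GPK(4, 4, 4) }
  if (profile || config == 0) { GPK(2, 2, 2) }
#undef GPK
}

int main() {
  dispatch(/*config=*/4, /*n=*/1000, /*profile=*/false); // runs <4, 4, 32>
  return 0;
}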
[Deleted by this hunk: the commented-out `_gpk_rev` setup and load phase. It cached shape, shape_c, ldvs, ldws, and unprocessed_dims in shared memory; derived nr/nc/nf and nr_c/nc_c/nf_c (collapsing dimensions when D_LOCAL < 3 or < 2); decomposed blockIdx into the (r, c, f) tile origin plus the linearized index of the unprocessed dimensions, setting `in_next` from the parity of each unprocessed index; padded even extents to nr_p/nc_p/nf_p; computed the `skip` predicate from unprocessed_dims_sm; offset the dv and dw* pointers by the per-block offsets, aliasing dwf/dwcf/dwrf/dwrcf onto dw/dwc/dwr/dwrc when TYPE == 2; loaded the per-axis ratio arrays with the 0.5 fix-up at even-size boundaries; zeroed v_sm; and loaded dw into the tile over the interior partition, three faces, three edges, and the corner vertex. Each load had a TYPE == 1 path (writing 0 when !INTERPOLATION && COEFF_RESTORE) and a TYPE == 2 path that loads f_gl*2 and f_gl*2 + 1 and zeroes entries whose in_next && f_in_next predicate holds.]
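Both deleted kernels lean on the same index trick: a flat threadId is split into consecutive partitions of (R/2) * (C/2) * (F/2) threads (eight of them cover the interior planes), and each partition's local index is decomposed row-major into (r, c, f). A minimal sketch with illustrative constants:

#include <cstdio>

int main() {
  const int R2 = 2, C2 = 2, F2 = 8;        // (R/2), (C/2), (F/2) for one config
  const int part = 2;                      // partition index (e.g. the dwc store)
  int threadId = part * R2 * C2 * F2 + 13; // a flat id inside partition 2

  int local = threadId - part * R2 * C2 * F2;
  int r = local / (C2 * F2);               // slowest-varying: rows
  int c = (local % (C2 * F2)) / F2;        // then columns
  int f = (local % (C2 * F2)) % F2;        // fastest-varying: the f axis

  std::printf("threadId %d -> local (r c f) = (%d %d %d)\n", threadId, r, c, f);
  return 0;
}

The deleted code then scales these by 2 (with per-plane +1 offsets) to get the shared-memory coordinates, and adds the r/2, c/2, f/2 tile origin to get the global ones.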
* 2 + 1); i++) { -// // for (int j = 0; j < min(rest_c_p, (C/2) * 2 + 1); j++) { -// // for (int k = 0; k < min(rest_f_p, (F/2) * 2 + 1); k++) { -// // printf("%2.2f ", v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // } -// // } -// // __syncthreads(); - -// if (dwf && threadId >= (R/2) * (C/2) * (F/2) && threadId < (R/2) * (C/2) * -// (F/2) * 2) { - -// r_sm = ((threadId - (R/2) * (C/2) * (F/2)) / ((C/2) * (F/2))) * 2; -// c_sm = (((threadId - (R/2) * (C/2) * (F/2)) % ((C/2) * (F/2))) / (F/2)) * -// 2; f_sm = (((threadId - (R/2) * (C/2) * (F/2)) % ((C/2) * (F/2))) % -// (F/2)) * 2 + 1; r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2)) / -// ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * (F/2)) % -// ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) * (C/2) * -// (F/2)) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { - -// res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { // fused -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// f_gl = 2 * f_gl + 1; -// // res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// } -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwc && threadId >= (R/2) * (C/2) * (F/2) * 2 && threadId < (R/2) * -// (C/2) * (F/2) * 3) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) / ((C/2) * (F/2))) * 2; -// c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 2) % ((C/2) * (F/2))) / -// (F/2)) * 2 + 1; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 2) % ((C/2) -// * (F/2))) % (F/2)) * 2; r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) -// * 2) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * (F/2) -// * 2) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) * -// (C/2) * (F/2) * 2) % ((C/2) * (F/2))) % (F/2); if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], 
-// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwr && threadId >= (R/2) * (C/2) * (F/2) * 3 && threadId < (R/2) * -// (C/2) * (F/2) * 4) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 3) / ((C/2) * (F/2))) * 2 + -// 1; c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 3) % ((C/2) * (F/2))) / -// (F/2)) * 2; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 3) % ((C/2) * -// (F/2))) % (F/2)) * 2; r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) * -// 3) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * (F/2) * -// 3) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) * (C/2) -// * (F/2) * 3) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf_c) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwcf && threadId >= (R/2) * (C/2) * (F/2) * 4 && threadId < (R/2) * -// (C/2) * (F/2) * 5) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 4) / ((C/2) * (F/2))) * 2; -// c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 4) % ((C/2) * (F/2))) / -// (F/2)) * 2 + 1; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 4) % ((C/2) -// * (F/2))) % (F/2)) * 2 + 1; r_gl = r / 2 + (threadId - (R/2) * (C/2) * -// (F/2) * 4) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * -// (F/2) * 4) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) -// * (C/2) * (F/2) * 4) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf - nf_c) { -// res = dwcf[get_idx(lddwcf1, lddwcf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T f1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// res += lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T f1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm + -// 1)], 
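
Every coefficient class touched by the deleted kernel above (dwf, dwc, dwr, and so on) follows the same update rule: read the stored coefficient, then either add to it or replace it with the linear interpolation of the two neighboring nodal values along the axis being processed. A minimal host-side sketch of that rule, assuming the usual lerp convention (1 - r) * a + r * b; the helper names here are hypothetical, not MGARD API:

template <typename T>
T lerp(T a, T b, T ratio) {
  // Linear interpolation between neighboring nodal values a and b.
  return a + ratio * (b - a);
}

// Update rule applied to each coefficient in the blocks above:
//  - INTERPOLATION && COEFF_RESTORE: fused restore, coefficient plus predictor
//  - INTERPOLATION only:             recompute the predictor, drop coefficient
//  - neither:                        pass the stored coefficient through
template <typename T>
T restore_coefficient(T coeff, T left, T right, T ratio,
                      bool interpolation, bool coeff_restore) {
  if (interpolation && coeff_restore)
    return coeff + lerp(left, right, ratio);
  if (interpolation)
    return lerp(left, right, ratio);
  return coeff;
}
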
ratio_f_sm[f_sm - 1]); -// res = lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwcf[get_idx(lddwrf1, lddwrf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwrf && threadId >= (R/2) * (C/2) * (F/2) * 5 && threadId < (R/2) * -// (C/2) * (F/2) * 6) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 5) / ((C/2) * (F/2))) * 2 + -// 1; c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 5) % ((C/2) * (F/2))) / -// (F/2)) * 2; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 5) % ((C/2) * -// (F/2))) % (F/2)) * 2 + 1; r_gl = r / 2 + (threadId - (R/2) * (C/2) * -// (F/2) * 5) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * -// (F/2) * 5) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) -// * (C/2) * (F/2) * 5) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf - nf_c) { - -// res = dwrf[get_idx(lddwrf1, lddwrf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T f1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); - -// res += lerp(f1, f2, ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T f1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); - -// res = lerp(f1, f2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwrf[get_idx(lddwrf1, lddwrf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwrc && threadId >= (R/2) * (C/2) * (F/2) * 6 && threadId < (R/2) * -// (C/2) * (F/2) * 7) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 6) / ((C/2) * (F/2))) * 2 + -// 1; c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 6) % ((C/2) * (F/2))) / -// (F/2)) * 2 + 1; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 6) % ((C/2) -// * (F/2))) % (F/2)) * 2; r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) -// * 6) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * (F/2) -// * 6) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) * -// (C/2) * (F/2) * 6) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < 
rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwrc[get_idx(lddwrc1, lddwrc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T c1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res += lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T c1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res = lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwrc[get_idx(lddwrc1, lddwrc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// T c1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res = lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwrcf && threadId >= (R/2) * (C/2) * (F/2) * 7 && threadId < (R/2) * -// (C/2) * (F/2) * 8) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 7) / ((C/2) * (F/2))) * 2 + -// 1; c_sm = (((threadId - (R/2) * (C/2) * (F/2) * 7) % ((C/2) * (F/2))) / -// (F/2)) * 2 + 1; f_sm = (((threadId - (R/2) * (C/2) * (F/2) * 7) % ((C/2) -// * (F/2))) % (F/2)) * 2 + 1; r_gl = r / 2 + (threadId - (R/2) * (C/2) * -// (F/2) * 7) / ((C/2) * (F/2)); c_gl = c / 2 + ((threadId - (R/2) * (C/2) * -// (F/2) * 7) % ((C/2) * (F/2))) / (F/2); f_gl = f / 2 + ((threadId - (R/2) -// * (C/2) * (F/2) * 7) % ((C/2) * (F/2))) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf - nf_c) { -// res = dwrcf[get_idx(lddwrcf1, lddwrcf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f3 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f4 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); - -// T fc1 = lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// T fc2 = lerp(f3, f4, ratio_c_sm[c_sm - 1]); - -// res += lerp(fc1, fc2, ratio_r_sm[r_sm - 1]); -// } 
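
The edge (dwcf, dwrf, dwrc) and corner (dwrcf) coefficient blocks compose that 1D step along two or three axes: four f-direction lerps are reduced along c, then along r. A sketch of the composition, with the same hypothetical lerp helper as in the previous sketch:

template <typename T>
T lerp(T a, T b, T ratio) { return a + ratio * (b - a); }

// Trilinear prediction from the 8 surrounding nodal values, composed of 1D
// lerps in the same order as the dwrcf block above: f first, then c, then r.
// v[i][j][k] indexes the (r, c, f) neighbors; ratios are the per-axis weights.
template <typename T>
T trilerp(const T v[2][2][2], T ratio_r, T ratio_c, T ratio_f) {
  T f1 = lerp(v[0][0][0], v[0][0][1], ratio_f);
  T f2 = lerp(v[0][1][0], v[0][1][1], ratio_f);
  T f3 = lerp(v[1][0][0], v[1][0][1], ratio_f);
  T f4 = lerp(v[1][1][0], v[1][1][1], ratio_f);
  T fc1 = lerp(f1, f2, ratio_c);   // reduce along c
  T fc2 = lerp(f3, f4, ratio_c);
  return lerp(fc1, fc2, ratio_r);  // reduce along r
}
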
else if (INTERPOLATION && !COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f3 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f4 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); - -// T fc1 = lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// T fc2 = lerp(f3, f4, ratio_c_sm[c_sm - 1]); - -// res = lerp(fc1, fc2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwrcf[get_idx(lddwrcf1, lddwrcf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// T c1 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res = lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (r + (R/2) * 2 == nr_p - 1) { -// if (threadId < (C/2) * (F/2)) { -// if (dwf) { -// r_sm = (R/2) * 2; -// c_sm = (threadId / (F/2)) * 2; -// f_sm = (threadId % (F/2)) * 2 + 1; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + threadId / (F/2); -// f_gl = f / 2 + threadId % (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// // res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// } -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwc) { -// r_sm = (R/2) * 2; -// c_sm = (threadId / (F/2)) * 2 + 1; -// f_sm = (threadId % (F/2)) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + threadId / (F/2); -// f_gl = f / 2 + threadId % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm 
- 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// if (dwcf) { -// r_sm = (R/2) * 2; -// c_sm = (threadId / (F/2)) * 2 + 1; -// f_sm = (threadId % (F/2)) * 2 + 1; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + threadId / (F/2); -// f_gl = f / 2 + threadId % (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf - nf_c) { -// res = dwcf[get_idx(lddwcf1, lddwcf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// res += lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// res = lerp(f1, f2, ratio_c_sm[c_sm - 1]); -// } -// } - -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwcf[get_idx(lddwcf1, lddwcf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// // if (idx[1] ==0 && idx[2] == 0) { -// // printf("%f(%d %d %d) %f(%d %d %d) -> %f\n", -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, -// f_sm)], -// // r_sm, c_sm - 1, f_sm, v_sm[get_idx(ldsm1, ldsm2, -// // r_sm, c_sm + 1, f_sm)], r_sm, c_sm + 1, f_sm, -// res); -// // } -// } -// } - -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// if (c + (C/2) * 2 == nc_p - 1) { -// if (threadId >= (R/2) * (C/2) * (F/2) && threadId < (R/2) * (C/2) * (F/2) -// + (R/2) * (F/2)) { -// if (dwf) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2)) / (F/2)) * 2; -// c_sm = (C/2) * 2; -// f_sm = ((threadId - (R/2) * (C/2) * (F/2)) % (F/2)) * 2 + 1; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2)) / (F/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (threadId - (R/2) * (C/2) * (F/2)) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && 
f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// // res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// } -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// if (dwr) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2)) / (F/2)) * 2 + 1; -// c_sm = (C/2) * 2; -// f_sm = ((threadId - (R/2) * (C/2) * (F/2)) % (F/2)) * 2; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2)) / (F/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (threadId - (R/2) * (C/2) * (F/2)) % (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf_c) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, -// f_sm)], ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// if (dwrf) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2)) / (F/2)) * 2 + 1; -// c_sm = (C/2) * 2; -// f_sm = ((threadId - (R/2) * (C/2) * (F/2)) % (F/2)) * 2 + 1; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2)) / (F/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (threadId - (R/2) * (C/2) * (F/2)) % (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// res = dwrf[get_idx(lddwrf1, lddwrf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// res += lerp(f1, f2, ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T f1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm 
- -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// T f2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// res = lerp(f1, f2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwrf[get_idx(lddwrf1, lddwrf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// if (f + (F/2) * 2 == nf_p - 1) { -// if (threadId >= (R/2) * (C/2) * (F/2) * 2 && threadId < (R/2) * (C/2) * -// (F/2) * 2 + (R/2) * (C/2)) { -// if (dwc) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2)) * 2; -// c_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2)) * 2 + 1; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2); -// c_gl = c / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2); -// f_gl = f / 2 + (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwr) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2)) * 2 + 1; -// c_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2)) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2); -// c_gl = c / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2); -// f_gl = f / 2 + (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf_c) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, -// f_sm)], ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 
c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } - -// if (dwrc) { -// r_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2)) * 2 + 1; -// c_sm = ((threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2)) * 2 + 1; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) / (C/2); -// c_gl = c / 2 + (threadId - (R/2) * (C/2) * (F/2) * 2) % (C/2); -// f_gl = f / 2 + (F/2); - -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwrc[get_idx(lddwrc1, lddwrc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// T c1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res += lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// T c1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res = lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwrc[get_idx(lddwrc1, lddwrc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// T c1 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// T c2 = -// lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// res = lerp(c1, c2, ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// if (c + (C/2) * 2 == nc_p - 1 && f + (F/2) * 2 == nf_p - 1) { -// if (threadId >= (R/2) * (C/2) * (F/2) * 3 && threadId < (R/2) * (C/2) * -// (F/2) * 3 + (R/2)) { -// if (dwr) { -// r_sm = (threadId - (R/2) * (C/2) * (F/2) * 3) * 2 + 1; -// c_sm = (C/2) * 2; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + threadId - (R/2) * (C/2) * (F/2) * 3; -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf_c) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, -// f_sm)], ratio_r_sm[r_sm - 1]); 
-// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr - nr_c && c_gl < nc_c && f_gl < nf) { -// res = dwr[get_idx(lddwr1, lddwr2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// if (r + (R/2) * 2 == nr_p - 1 && f + (F/2) * 2 == nf_p - 1) { -// if (threadId >= (R/2) * (C/2) * (F/2) * 4 && threadId < (R/2) * (C/2) * -// (F/2) * 4 + (C/2)) { -// if (dwc) { -// r_sm = (R/2) * 2; -// c_sm = (threadId - (R/2) * (C/2) * (F/2) * 4) * 2 + 1; -// f_sm = (F/2) * 2; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + threadId - (R/2) * (C/2) * (F/2) * 4; -// f_gl = f / 2 + (F/2); -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, -// f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, -// f_sm)], ratio_c_sm[c_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl *= 2; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// if (r + (R/2) * 2 == nr_p - 1 && c + (C/2) * 2 == nc_p - 1) { -// if (threadId >= (R/2) * (C/2) * (F/2) * 5 && threadId < (R/2) * (C/2) * -// (F/2) * 5 + (F/2)) { -// if (dwf) { -// r_sm = (R/2) * 2; -// c_sm = (C/2) * 2; -// f_sm = (threadId - (R/2) * (C/2) * (F/2) * 5) * 2 + 1; -// r_gl = r / 2 + (R/2); -// c_gl = c / 2 + (C/2); -// f_gl = f / 2 + threadId - (R/2) * (C/2) * (F/2) * 5; -// if (TYPE == 1) { -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION && COEFF_RESTORE) { -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - -// 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + -// 1)], ratio_f_sm[f_sm - 1]); -// } else if (INTERPOLATION && !COEFF_RESTORE) { -// res = lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// } -// } -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } else if (TYPE == 2) { -// f_gl = 2 * f_gl + 1; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < 
rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf) { -// // res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// if (!skip) { -// if (INTERPOLATION) { -// ; -// } -// } -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } -// } - -// // __syncthreads(); -// // if (debug) { -// // printf("TYPE: %d %d %d %d\n", TYPE, min(rest_r_p, (R/2) * 2 + 1), -// // min(rest_c_p, (C/2) * 2 + 1), min(rest_f_p, (F/2) * 2 + 1)); -// // for (int i = 0; i < min(rest_r_p, (R/2) * 2 + 1); i++) { -// // for (int j = 0; j < min(rest_c_p, (C/2) * 2 + 1); j++) { -// // for (int k = 0; k < min(rest_f_p, (F/2) * 2 + 1); k++) { -// // printf("%2.2f ", v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // } -// // } -// // __syncthreads(); - -// __syncthreads(); - -// r_sm = threadIdx.z; -// c_sm = threadIdx.y; -// f_sm = threadIdx.x; - -// r_sm_ex = FunctorBase::GetBlockDimZ(); -// c_sm_ex = FunctorBase::GetBlockDimY(); -// f_sm_ex = FunctorBase::GetBlockDimX(); - -// r_gl = r + r_sm; -// c_gl = c + c_sm; -// f_gl = f + f_sm; - -// // r_gl_ex = r + (R/2) * 2; -// // c_gl_ex = c + (C/2) * 2; -// // f_gl_ex = f + (F/2) * 2; - -// r_gl_ex = r + rest_r - 1; -// c_gl_ex = c + rest_c - 1; -// f_gl_ex = f + rest_f - 1; - -// int unpadding_r = rest_r; -// int unpadding_c = rest_c; -// int unpadding_f = rest_f; -// if (nr % 2 == 0) -// unpadding_r -= 1; -// if (nc % 2 == 0) -// unpadding_c -= 1; -// if (TYPE == 1 && nf % 2 == 0) -// unpadding_f -= 1; - -// if (r_sm < unpadding_r && c_sm < unpadding_c && f_sm < unpadding_f) { - -// // store extra rules -// // case 1: input = odd (non-padding required) -// // case 1.a: block size + 1 == rest (need to store extra); -// // case 1.b: block size + 1 != rest (No need to store extra); -// // case 2: input = even (un-padding requried) -// // case 2.a: block size + 1 >= rest (No need to store extra, but need -// // un-padding first); case 2.b: block size + 1 < rest (No need to -// store -// // extra); - -// if (D_LOCAL >= 3 && r_sm == 0) { -// if (nr % 2 != 0 && (R/2) * 2 + 1 == rest_r) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm)]; -// } -// } -// if (nr % 2 == 0 && (R/2) * 2 + 1 >= rest_r_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm)]; -// } -// } - -// if (D_LOCAL >= 2 && c_sm == 0) { -// if (nc % 2 != 0 && (C/2) * 2 + 1 == rest_c) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm)]; -// } -// } -// if (nc % 2 == 0 && (C/2) * 2 + 1 >= rest_c_p) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm)]; -// } -// } - -// if (D_LOCAL >= 1 && f_sm == 0) { -// if (nf % 2 != 0 && (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm_ex)]; -// } -// } -// if (nf % 2 == 0 && (F/2) * 2 + 1 >= 
rest_f_p && TYPE == 1) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f_p - 1)]; -// } -// } - -// // load extra edges -// if (D_LOCAL >= 2 && c_sm == 0 && f_sm == 0) { -// if (nc % 2 != 0 && (C/2) * 2 + 1 == rest_c && nf % 2 != 0 && -// (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm_ex)]; -// } -// } -// if (nc % 2 == 0 && nf % 2 == 0 && (C/2) * 2 + 1 >= rest_c_p && -// (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, rest_f_p - 1)]; -// } -// if (nc % 2 == 0 && nf % 2 != 0 && (C/2) * 2 + 1 >= rest_c_p && -// (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm_ex)]; -// } -// } -// if (nc % 2 != 0 && nf % 2 == 0 && (C/2) * 2 + 1 == rest_c && -// (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, rest_f_p - 1)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, rest_f_p - 1)]; -// // printf("(%d %d %d): %f <- (%d %d %d)\n", -// // r_gl, c_gl_ex, f_gl_ex, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)], -// // r_sm, c_sm_ex, f_gl_ex); -// } -// } -// } - -// if (D_LOCAL >= 3 && r_sm == 0 && f_sm == 0) { -// if (nr % 2 != 0 && (R/2) * 2 + 1 == rest_r && nf % 2 != 0 && -// (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm_ex)]; -// } -// } -// if (nr % 2 == 0 && nf % 2 == 0 && (R/2) * 2 + 1 >= rest_r_p && -// (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, rest_f_p - 1)]; -// } -// if (nr % 2 == 0 && nf % 2 != 0 && (R/2) * 2 + 1 >= rest_r_p && -// (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm_ex)]; -// } -// } -// if (nr % 2 != 0 && nf % 2 == 0 && (R/2) * 2 + 1 == rest_r && -// (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, rest_f_p - 1)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, rest_f_p - 1)]; -// // printf("(%d %d %d): %f <- (%d %d %d)\n", -// // r_gl_ex, c_gl, rest_f-1, -// // dv[get_idx(lddv1, lddv2, r_gl_ex-1, c_gl, f_gl_ex)], -// // r_sm_ex, c_sm, rest_f_p-1); -// } -// } 
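
The store rules spelled out in the comments above reduce to a parity check per dimension: an odd-sized dimension keeps its boundary plane as-is, while an even-sized dimension was padded by one plane during processing and that padded plane has to be folded back before the block is written out. A small sketch of the bookkeeping (helper names hypothetical):

#include <cstddef>

// Even dimensions are processed with one plane of padding (n_p = n + 1);
// odd dimensions need none. Mirrors the nr_p/nc_p/nf_p setup in the kernel.
inline std::size_t padded_size(std::size_t n) {
  return (n % 2 == 0) ? n + 1 : n;
}

// Un-padding on the way out: copy the value parked at the padded index back
// to the true last index, as the rest_*_p -> rest_* copies above do.
template <typename T>
void unpad_last(T *line, std::size_t n) {
  std::size_t n_p = padded_size(n);
  if (n_p != n)
    line[n - 1] = line[n_p - 1];
}
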
-// } - -// if (D_LOCAL >= 3 && r_sm == 0 && c_sm == 0) { -// if (nr % 2 != 0 && (R/2) * 2 + 1 == rest_r && nc % 2 != 0 && -// (C/2) * 2 + 1 == rest_c) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm)]; -// } -// } -// if (nr % 2 == 0 && nc % 2 == 0 && (R/2) * 2 + 1 >= rest_r_p && -// (C/2) * 2 + 1 >= rest_c_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, f_sm)]; -// } -// if (nr % 2 == 0 && nc % 2 != 0 && (R/2) * 2 + 1 >= rest_r_p && -// (C/2) * 2 + 1 == rest_c) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm)]; -// } -// } -// if (nr % 2 != 0 && nc % 2 == 0 && (R/2) * 2 + 1 == rest_r && -// (C/2) * 2 + 1 >= rest_c_p) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm)]; -// } -// } -// } -// // load extra vertex - -// if (D_LOCAL >= 3 && r_sm == 0 && c_sm == 0 && f_sm == 0) { -// if (nr % 2 != 0 && (R/2) * 2 + 1 == rest_r && nc % 2 != 0 && -// (C/2) * 2 + 1 == rest_c && nf % 2 != 0 && (F/2) * 2 + 1 == rest_f) -// { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm_ex)]; -// } -// } - -// if (nr % 2 == 0 && nc % 2 == 0 && nf % 2 == 0 && (R/2) * 2 + 1 >= -// rest_r_p && -// (C/2) * 2 + 1 >= rest_c_p && (F/2) * 2 + 1 >= rest_f_p && TYPE == -// 1) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, -// rest_f_p - 1)]; -// } -// if (nr % 2 == 0 && nc % 2 == 0 && nf % 2 != 0 && (R/2) * 2 + 1 >= -// rest_r_p && -// (C/2) * 2 + 1 >= rest_c_p && (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, -// f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, -// f_sm_ex)]; -// } -// } -// if (nr % 2 == 0 && nc % 2 != 0 && nf % 2 == 0 && (R/2) * 2 + 1 >= -// rest_r_p && -// (C/2) * 2 + 1 == rest_c && (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) -// { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, rest_f_p - -// 1)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, rest_f_p - -// 1)]; -// } -// } -// if (nr % 2 != 0 && nc % 2 == 0 && nf % 2 == 0 && (R/2) * 2 + 1 == -// rest_r && -// (C/2) * 2 + 1 >= rest_c_p && (F/2) * 2 + 1 >= rest_f_p && TYPE == -// 1) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// 
dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, rest_f_p - -// 1)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, rest_f_p - -// 1)]; -// } -// } -// if (nr % 2 == 0 && nc % 2 != 0 && nf % 2 != 0 && (R/2) * 2 + 1 >= -// rest_r_p && -// (C/2) * 2 + 1 == rest_c && (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm_ex)]; -// } -// } -// if (nr % 2 != 0 && nc % 2 == 0 && nf % 2 != 0 && (R/2) * 2 + 1 == -// rest_r && -// (C/2) * 2 + 1 >= rest_c_p && (F/2) * 2 + 1 == rest_f) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm_ex)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm_ex)]; -// } -// } -// if (nr % 2 != 0 && nc % 2 != 0 && nf % 2 == 0 && (R/2) * 2 + 1 == -// rest_r && -// (C/2) * 2 + 1 == rest_c && (F/2) * 2 + 1 >= rest_f_p && TYPE == 1) -// { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, rest_f_p - 1)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, rest_f_p - 1)]; -// } -// } -// } -// } - -// __syncthreads(); - -// if (r_sm < rest_r && c_sm < rest_c && f_sm < rest_f) { -// if (r_gl >= svr && r_gl < svr + nvr && c_gl >= svc && c_gl < svc + nvc && -// f_gl >= svf && f_gl < svf + nvf) { -// if (!INTERPOLATION && COEFF_RESTORE) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)] += -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// } else { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; -// } -// } -// } -// } - -// template -// void gpk_rev_adaptive_launcher( -// Handle &handle, SIZE *shape_h, SIZE *shape_d, SIZE -// *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM unprocessed_n, DIM -// *unprocessed_dims, DIM curr_dim_r, DIM curr_dim_c, DIM curr_dim_f, T -// *dratio_r, T *dratio_c, T *dratio_f, T *dv, LENGTH lddv1, LENGTH lddv2, T -// *dw, LENGTH lddw1, LENGTH lddw2, T *dwf, LENGTH lddwf1, LENGTH lddwf2, T -// *dwc, LENGTH lddwc1, LENGTH lddwc2, T *dwr, LENGTH lddwr1, LENGTH lddwr2, -// T *dwcf, LENGTH lddwcf1, LENGTH lddwcf2, T *dwrf, LENGTH lddwrf1, LENGTH -// lddwrf2, T *dwrc, LENGTH lddwrc1, LENGTH lddwrc2, T *dwrcf, LENGTH -// lddwrcf1, LENGTH lddwrcf2, SIZE svr, SIZE svc, SIZE svf, SIZE nvr, SIZE -// nvc, SIZE nvf, int queue_idx) { - -// SIZE nr = shape_h[curr_dim_r]; -// SIZE nc = shape_h[curr_dim_c]; -// SIZE nf = shape_h[curr_dim_f]; -// if (D_LOCAL == 2) { -// nr = 1; -// } -// SIZE total_thread_z = std::max(nr - 1, (SIZE)1); -// SIZE total_thread_y = std::max(nc - 1, (SIZE)1); -// SIZE total_thread_x = std::max(nf - 1, (SIZE)1); - -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// // tbz = std::min(R, total_thread_z); -// // tby = std::min(C, total_thread_y); -// // tbx = std::min(F, total_thread_x); -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = ((R + 1) * (C + 1) * (F + 1) + R + C + 
F) * sizeof(T); -// sm_size += (D_GLOBAL * 4) * sizeof(SIZE); -// sm_size += (D_GLOBAL * 1) * sizeof(DIM); - -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// for (DIM d = 0; d < D_GLOBAL; d++) { -// if (D_LOCAL == 3 && d != curr_dim_f && d != curr_dim_c && d != -// curr_dim_r) { -// gridx *= shape_h[d]; -// } -// if (D_LOCAL == 2 && d != curr_dim_f && d != curr_dim_c) { -// gridx *= shape_h[d]; -// } -// } - -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); - -// // printf("gpk_rev exec: %d %d %d %d %d %d\n", tbx, tby, tbz, gridx, gridy, -// // gridz); -// _gpk_rev<<>>( -// shape_d, shape_c_d, ldvs, ldws, unprocessed_n, unprocessed_dims, -// curr_dim_r, curr_dim_c, curr_dim_f, dratio_r, dratio_c, dratio_f, dv, -// lddv1, lddv2, dw, lddw1, lddw2, dwf, lddwf1, lddwf2, dwc, lddwc1, -// lddwc2, dwr, lddwr1, lddwr2, dwcf, lddwcf1, lddwcf2, dwrf, lddwrf1, -// lddwrf2, dwrc, lddwrc1, lddwrc2, dwrcf, lddwrcf1, lddwrcf2, svr, svc, -// svf, nvr, nvc, nvf); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void gpk_rev(Handle &handle, SIZE *shape_h, SIZE *shape_d, -// SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM unprocessed_n, -// DIM *unprocessed_dims, DIM curr_dim_r, DIM curr_dim_c, -// DIM curr_dim_f, T *dratio_r, T *dratio_c, T *dratio_f, T *dv, -// LENGTH lddv1, LENGTH lddv2, T *dw, LENGTH lddw1, LENGTH lddw2, T -// *dwf, LENGTH lddwf1, LENGTH lddwf2, T *dwc, LENGTH lddwc1, -// LENGTH lddwc2, T *dwr, LENGTH lddwr1, LENGTH lddwr2, T *dwcf, -// LENGTH lddwcf1, LENGTH lddwcf2, T *dwrf, LENGTH lddwrf1, LENGTH -// lddwrf2, T *dwrc, LENGTH lddwrc1, LENGTH lddwrc2, T *dwrcf, -// LENGTH lddwrcf1, LENGTH lddwrcf2, SIZE svr, SIZE svc, SIZE svf, -// SIZE nvr, SIZE nvc, SIZE nvf, int queue_idx, int config) { - -// #define GPK(R, C, F) \ -// { \ -// gpk_rev_adaptive_launcher( \ -// handle, shape_h, shape_d, shape_c_d, ldvs, ldws, unprocessed_n, \ -// unprocessed_dims, curr_dim_r, curr_dim_c, curr_dim_f, dratio_r, \ -// dratio_c, dratio_f, dv, lddv1, lddv2, dw, lddw1, lddw2, dwf, lddwf1, \ -// lddwf2, dwc, lddwc1, lddwc2, dwr, lddwr1, lddwr2, dwcf, lddwcf1, \ -// lddwcf2, dwrf, lddwrf1, lddwrf2, dwrc, lddwrc1, lddwrc2, dwrcf, \ -// lddwrcf1, lddwrcf2, svr, svc, svf, nvr, nvc, nvf, queue_idx); \ -// } -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D_LOCAL == 3) { -// // if (profile || config == 6) { -// // GPK(2, 2, 128) -// // } -// // if (profile || config == 5) { -// // GPK(2, 2, 64) -// // } -// // if (profile || config == 4) { -// // GPK(4, 4, 32) -// // } -// // if (profile || config == 3) { -// // GPK(4, 4, 16) -// // } -// // if (profile || config == 2) { -// // GPK(4, 4, 8) -// // } -// // if (profile || config == 1) { -// GPK(4, 4, 4) -// // } -// // if (profile || config == 0) { -// // GPK(4, 4, 4) -// // } -// } else if (D_LOCAL == 2) { -// if (profile || config == 6) { -// GPK(1, 2, 128) -// } -// if (profile || config == 5) { -// GPK(1, 2, 64) -// } -// if (profile || config == 4) { -// GPK(1, 4, 32) -// } -// if (profile || config == 3) { -// GPK(1, 4, 16) -// } -// if (profile || config == 2) { -// GPK(1, 4, 8) -// } -// if (profile || config == 1) { -// GPK(1, 4, 4) -// } -// if (profile || config == 0) { -// GPK(1, 2, 4) -// } -// } else if (D_LOCAL == 1) { -// if (profile || config == 6) { -// GPK(1, 1, 128) -// } 
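
The commented-out launcher above picks a fixed (R, C, F) thread block, sizes the grid by ceiling division over the coarse index space, folds every dimension not handled by this pass into gridx, and budgets shared memory for the block's tile plus one ratio vector per axis. A condensed sketch under those assumptions (integer ceiling division stands in for the float-based ceil; the struct and function names are illustrative, not MGARD API):

#include <algorithm>
#include <cstddef>

struct LaunchShape {
  unsigned tbx, tby, tbz;       // threads per block
  unsigned gridx, gridy, gridz; // blocks per grid
  std::size_t sm_bytes;         // dynamic shared memory
};

template <typename T, unsigned R, unsigned C, unsigned F>
LaunchShape gpk_launch_shape(unsigned nr, unsigned nc, unsigned nf) {
  LaunchShape l;
  l.tbz = R; l.tby = C; l.tbx = F;
  unsigned tz = std::max(nr - 1, 1u); // total threads per axis (n >= 1 assumed)
  unsigned ty = std::max(nc - 1, 1u);
  unsigned tx = std::max(nf - 1, 1u);
  l.gridz = (tz + R - 1) / R;         // ceil(t / tb)
  l.gridy = (ty + C - 1) / C;
  l.gridx = (tx + F - 1) / F;         // extra dims would multiply into gridx
  // Tile of (R+1)(C+1)(F+1) values plus R + C + F interpolation ratios.
  l.sm_bytes = ((R + 1) * (C + 1) * (F + 1) + R + C + F) * sizeof(T);
  return l;
}
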
-// if (profile || config == 5) {
-// GPK(1, 1, 64)
-// }
-// if (profile || config == 4) {
-// GPK(1, 1, 32)
-// }
-// if (profile || config == 3) {
-// GPK(1, 1, 16)
-// }
-// if (profile || config == 2) {
-// GPK(1, 1, 8)
-// }
-// if (profile || config == 1) {
-// GPK(1, 1, 8)
-// }
-// if (profile || config == 0) {
-// GPK(1, 1, 8)
-// }
-// }
-// #undef GPK
-// }
-
 } // namespace mgard_x
 
 #endif
\ No newline at end of file
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.h b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.h
deleted file mode 100644
index 4769f7cdef..0000000000
--- a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.h
+++ /dev/null
@@ -1,38 +0,0 @@
-/*
- * Copyright 2022, Oak Ridge National Laboratory.
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs
- * Author: Jieyang Chen (chenj3@ornl.gov)
- * Date: March 17, 2022
- */
-
-#ifndef MGARD_X_GRID_PROCESSING_KERNEL_3D
-#define MGARD_X_GRID_PROCESSING_KERNEL_3D
-
-#include "../../Common.h"
-
-template
-void gpk_reo_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf, T *dratio_r,
-                T *dratio_c, T *dratio_f, T *dv, SIZE lddv1, SIZE lddv2, T *dw,
-                SIZE lddw1, SIZE lddw2, T *dwf, SIZE lddwf1, SIZE lddwf2,
-                T *dwc, SIZE lddwc1, SIZE lddwc2, T *dwr, SIZE lddwr1,
-                SIZE lddwr2, T *dwcf, SIZE lddwcf1, SIZE lddwcf2, T *dwrf,
-                SIZE lddwrf1, SIZE lddwrf2, T *dwrc, SIZE lddwrc1, SIZE lddwrc2,
-                T *dwrcf, SIZE lddwrcf1, SIZE lddwrcf2, int queue_idx,
-                int config);
-
-template
-void gpk_rev_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf, T *dratio_r,
-                T *dratio_c, T *dratio_f, T *dv, SIZE lddv1, SIZE lddv2, T *dw,
-                SIZE lddw1, SIZE lddw2, T *dwf, SIZE lddwf1, SIZE lddwf2,
-                T *dwc, SIZE lddwc1, SIZE lddwc2, T *dwr, SIZE lddwr1,
-                SIZE lddwr2, T *dwcf, SIZE lddwcf1, SIZE lddwcf2, T *dwrf,
-                SIZE lddwrf1, SIZE lddwrf2, T *dwrc, SIZE lddwrc1, SIZE lddwrc2,
-                T *dwrcf, SIZE lddwrcf1, SIZE lddwrcf2, SIZE svr, SIZE svc,
-                SIZE svf, SIZE nvr, SIZE nvc, SIZE nvf, int queue_idx,
-                int config);
-
-} // namespace mgard_x
-
-#endif
\ No newline at end of file
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.hpp
index 1c40a28602..8e2a981bea 100644
--- a/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.hpp
+++ b/include/mgard-x/DataRefactoring/MultiDimension/Coefficient/GridProcessingKernel3D.hpp
@@ -1227,15 +1227,12 @@ class GpkReo3D : public AutoTuner {
                  SubArray wrc, SubArray wrcf, int queue_idx) {
     int range_l = std::min(6, (int)std::log2(nf) - 1);
-    int arch = DeviceRuntime::GetArchitectureGeneration();
     int prec = TypeToIdx();
-    // int config =
-    // AutoTuner::autoTuningTable.auto_tuning_cc[arch][prec][range_l];
     int config = AutoTuner::autoTuningTable.gpk_reo_3d[prec][range_l];
-
     double min_time = std::numeric_limits<double>::max();
     int min_config = 0;
+    ExecutionReturn ret;
 
 #define GPK(CONFIG) \
   if (config == CONFIG || AutoTuner::ProfileKernels) { \
@@ -1248,22 +1245,26 @@ class GpkReo3D : public AutoTuner {
                                 ratio_c, ratio_f, v, w, wf, wc, wr, wcf, \
                                 wrf, wrc, wrcf, queue_idx); \
     DeviceAdapter adapter; \
-    ExecutionReturn ret = adapter.Execute(task); \
+    ret = adapter.Execute(task); \
     if (AutoTuner::ProfileKernels) { \
-      if (min_time > ret.execution_time) { \
+      if (ret.success && min_time > ret.execution_time) { \
        min_time = ret.execution_time; \
        min_config = CONFIG; \
      } \
    } \
  }
-  GPK(0)
-  GPK(1)
-  GPK(2)
-  GPK(3)
-  GPK(4)
-  GPK(5)
-  GPK(6)
+  GPK(6) if (!ret.success) config--;
+  GPK(5) if (!ret.success) config--;
+  GPK(4) if (!ret.success) config--;
+  GPK(3) if (!ret.success) config--;
+  GPK(2) if (!ret.success) config--;
+  GPK(1) if (!ret.success) config--;
+  GPK(0) if (!ret.success) config--;
+  if (config < 0 && !ret.success) {
+    std::cout << log::log_err << "no suitable config for GpkReo3D.\n";
+    exit(-1);
+  }
 #undef GPK
 
     if (AutoTuner::ProfileKernels) {
@@ -2444,15 +2445,12 @@ class GpkRev3D : public AutoTuner {
                  SIZE svr, SIZE svc, SIZE svf, SIZE nvr, SIZE nvc, SIZE nvf,
                  int queue_idx) {
     int range_l = std::min(6, (int)std::log2(nf) - 1);
-    int arch = DeviceRuntime::GetArchitectureGeneration();
     int prec = TypeToIdx();
-    // int config =
-    // AutoTuner::autoTuningTable.auto_tuning_cc[arch][prec][range_l];
     int config = AutoTuner::autoTuningTable.gpk_rev_3d[prec][range_l];
-
     double min_time = std::numeric_limits<double>::max();
     int min_config = 0;
+    ExecutionReturn ret;
 
 #define GPK(CONFIG) \
   if (config == CONFIG || AutoTuner::ProfileKernels) { \
@@ -2465,22 +2463,26 @@ class GpkRev3D : public AutoTuner {
     nr, nc, nf, nr_c, nc_c, nf_c, ratio_r, ratio_c, ratio_f, v, w, wf, wc, \
     wr, wcf, wrf, wrc, wrcf, svr, svc, svf, nvr, nvc, nvf, queue_idx); \
     DeviceAdapter adapter; \
-    ExecutionReturn ret = adapter.Execute(task); \
+    ret = adapter.Execute(task); \
     if (AutoTuner::ProfileKernels) { \
-      if (min_time > ret.execution_time) { \
+      if (ret.success && min_time > ret.execution_time) { \
        min_time = ret.execution_time; \
        min_config = CONFIG; \
      } \
    } \
  }
-  GPK(0)
-  GPK(1)
-  GPK(2)
-  GPK(3)
-  GPK(4)
-  GPK(5)
-  GPK(6)
+  GPK(6) if (!ret.success) config--;
+  GPK(5) if (!ret.success) config--;
+  GPK(4) if (!ret.success) config--;
+  GPK(3) if (!ret.success) config--;
+  GPK(2) if (!ret.success) config--;
+  GPK(1) if (!ret.success) config--;
+  GPK(0) if (!ret.success) config--;
+  if (config < 0 && !ret.success) {
+    std::cout << log::log_err << "no suitable config for GpkRev3D.\n";
+    exit(-1);
+  }
 #undef GPK
 
     if (AutoTuner::ProfileKernels) {
@@ -2489,2444 +2491,6 @@ class GpkRev3D : public AutoTuner {
   }
 };
 
-// template
-// MGARDX_EXEC void
-// __gpk_reo_3d(IDX ngridz, IDX ngridy, IDX ngridx,
-// IDX nblockz, IDX nblocky, IDX nblockx,
-// IDX blockz, IDX blocky, IDX blockx,
-// IDX threadz, IDX thready, IDX threadx,
-// SIZE nr, SIZE nc, SIZE nf,
-// SIZE nr_c, SIZE nc_c, SIZE nf_c,
-// T *dratio_r,
-// T *dratio_c, T *dratio_f,
-// T *dv, SIZE lddv1, SIZE lddv2,
-// T *dw, SIZE lddw1, SIZE lddw2,
-// T *dwf, SIZE lddwf1, SIZE lddwf2,
-// T *dwc, SIZE lddwc1, SIZE lddwc2,
-// T *dwr, SIZE lddwr1, SIZE lddwr2,
-// T *dwcf, SIZE lddwcf1, SIZE lddwcf2,
-// T *dwrf, SIZE lddwrf1, SIZE lddwrf2,
-// T *dwrc, SIZE lddwrc1, SIZE lddwrc2,
-// T *dwrcf, SIZE lddwrcf1, SIZE lddwrcf2) {
-
-// // // to be removed
-// int TYPE = 1;
-// bool INTERPOLATION = true;
-// bool CALC_COEFF = true;
-// bool in_next = false;
-// bool skip = false;
-
-// SIZE r, c, f;
-// SIZE rest_r, rest_c, rest_f;
-// SIZE nr_p, nc_p, nf_p;
-// SIZE rest_r_p, rest_c_p, rest_f_p;
-// SIZE r_sm, c_sm, f_sm;
-// SIZE r_sm_ex, c_sm_ex, f_sm_ex;
-// SIZE r_gl, c_gl, f_gl;
-// SIZE r_gl_ex, c_gl_ex, f_gl_ex;
-// LENGTH threadId;
-
-// T res;
-
-// // r = blockIdx.z * blockDim.z;
-// // c = blockIdx.y * blockDim.y;
-// // f = blockIdx.x * blockDim.x;
-
-// r = blockz * nblockz;
-// c = blocky * nblocky;
-// f = blockx * nblockx;
-
-// rest_r = nr - r;
-// rest_c = nc - c;
-// rest_f = nf
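
The two hunks above change the auto-tuning dispatch in the same way: instead of launching configurations 0 through 6 unconditionally, the dispatcher now starts from the largest configuration and, whenever Execute() reports failure (for example because the device cannot satisfy the block or shared-memory requirements), decrements config so the next smaller candidate runs; if all seven fail, it aborts. A condensed sketch of that control flow, where the hypothetical try_config stands in for the GPK(CONFIG) macro expansion:

#include <cstdio>
#include <cstdlib>

bool try_config(int cfg); // launches the kernel variant; false on failure

void dispatch(int config) {
  // Walk candidates from largest (6) to smallest (0); each failure shifts
  // the requested config down by one so the next candidate is attempted.
  for (int cfg = 6; cfg >= 0; --cfg) {
    if (config != cfg)
      continue;          // not the requested configuration
    if (try_config(cfg))
      return;            // success: done
    --config;            // failure: fall through to cfg - 1
  }
  std::fprintf(stderr, "no suitable config for GpkReo3D.\n");
  std::exit(-1);
}
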
[This hunk deletes a large block of commented-out code: the body of the dead __gpk_reo_3d device function. The deleted code padded even-sized dimensions (nr_p = nr + 1, mirroring the last plane, row, or fiber into the padded slot), staged a (R*2+1) x (C*2+1) x (F*2+1) tile of dv together with the ratio_r/ratio_c/ratio_f weight arrays in shared memory, loaded the extra boundary faces, edges, and corner vertex of the tile, and then wrote the even-indexed (coarse) nodes out to dw, walking a running `base` offset through the boundary faces, edges, and vertex so each leftover node class got its own thread range.]
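The index arithmetic in the deleted coarse-node stores is easier to see outside the kernel. Below is a minimal host-side sketch of the mapping the dead code used for its first R * C * F thread ids; the tile half-sizes R, C, F are illustrative values, not a configuration taken from the source.

```cpp
#include <cstdio>

// Host-side sketch: the first R*C*F linear thread ids cover the
// even-indexed (coarse) nodes of an (R*2+1) x (C*2+1) x (F*2+1) tile,
// exactly as in the deleted __gpk_reo_3d stores to dw.
int main() {
  const int R = 2, C = 2, F = 2; // illustrative tile half-sizes
  for (int threadId = 0; threadId < R * C * F; threadId++) {
    int r_sm = (threadId / (C * F)) * 2;       // even row within the tile
    int c_sm = ((threadId % (C * F)) / F) * 2; // even column within the tile
    int f_sm = ((threadId % (C * F)) % F) * 2; // even fiber within the tile
    std::printf("thread %d -> tile node (%d, %d, %d)\n", threadId, r_sm,
                c_sm, f_sm);
  }
  return 0;
}
```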
[Deleted continuation of the same dead function: the coefficient stores. For each odd-parity class of tile nodes (odd in f, c, or r only, and the mixed cf, rf, rc, and rcf classes), one thread computed res = v_sm[node] minus the linear, bilinear, or trilinear interpolation of the surrounding even-indexed nodes (built from lerp and the staged ratio arrays) and wrote res to the matching output array (dwf, dwc, dwr, dwcf, dwrf, dwrc, dwrcf). The trailing blocks repeated the same computation for nodes sitting on the last padded r, c, and f planes and on the domain edges, each guarded by a check that the block covers that boundary.]
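The trilinear ("rcf") case is the most involved of the deleted coefficient stores; the sketch below reproduces its structure on the host, under the assumption that lerp(a, b, w) = a + w * (b - a), which matches how the ratio weights are applied in the dead code. The neighbor values and ratios are made-up numbers for illustration.

```cpp
#include <cstdio>

// Assumed linear interpolation, applied along f, then c, then r,
// mirroring the f1..f4 / fc1,fc2 / res chain in the deleted code.
template <typename T> T lerp(T a, T b, T w) { return a + w * (b - a); }

int main() {
  // Eight even-indexed neighbors n[r][c][f] of an (odd, odd, odd) node.
  double n[2][2][2] = {{{1.0, 2.0}, {3.0, 4.0}}, {{5.0, 6.0}, {7.0, 8.0}}};
  double v = 4.2; // value at the odd node (illustrative)
  double ratio_r = 0.5, ratio_c = 0.5, ratio_f = 0.5;

  double f1 = lerp(n[0][0][0], n[0][0][1], ratio_f);
  double f2 = lerp(n[0][1][0], n[0][1][1], ratio_f);
  double f3 = lerp(n[1][0][0], n[1][0][1], ratio_f);
  double f4 = lerp(n[1][1][0], n[1][1][1], ratio_f);
  double fc1 = lerp(f1, f2, ratio_c);
  double fc2 = lerp(f3, f4, ratio_c);
  double interp = lerp(fc1, fc2, ratio_r);
  std::printf("coefficient = %f\n", v - interp); // res = v - interpolation
  return 0;
}
```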
[Deleted alongside the device function: the commented-out _gpk_reo_3d kernel wrapper, which forwarded gridDim/blockDim/blockIdx/threadIdx to __gpk_reo_3d; the gpk_reo_3d_adaptive_launcher, which set nr_c = nr / 2 + 1 (and likewise for nc_c, nf_c), fixed the block shape to (F, C, R), sized the grid as the ceiling division of max(n - 1, 1) per dimension, computed sm_size = ((R + 1) * (C + 1) * (F + 1) + R + C + F) * sizeof(T), launched the kernel, and ran gpuErrchk plus an optional cudaDeviceSynchronize when handle.sync_and_check_all_kernels was set; and the gpk_reo_3d dispatcher, which expanded a GPK(R, C, F) macro over seven (R, C, F) configurations per dimensionality D (3, 2, or 1), selected by a config index 0-6, or over all of them when handle.profile_kernels was set. The hunk then begins deleting the matching commented-out reverse kernel, _gpk_rev_3d, whose preamble mirrors the forward kernel's shared-memory layout, rest_*/rest_*_p bookkeeping, and even-dimension padding setup.]
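For reference, a host-side sketch of the deleted launcher's execution-config arithmetic follows. Note one assumption: the shared-memory size here is derived from the kernel's own tile layout, the (2R+1)(2C+1)(2F+1) values plus the three ratio arrays, which is not the same expression the deleted launcher used; treat it as a reading of the kernel, not of the launcher.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>

using SIZE = unsigned int; // stand-in for the deleted code's SIZE
using T = double;          // stand-in for the deleted code's T

int main() {
  SIZE nr = 65, nc = 65, nf = 65;  // illustrative fine-grid extents
  const SIZE R = 4, C = 4, F = 16; // one of the dispatched configs
  // One block covers a (2R) x (2C) x (2F) brick of the fine grid.
  SIZE total_z = std::max(nr - 1, (SIZE)1);
  SIZE total_y = std::max(nc - 1, (SIZE)1);
  SIZE total_x = std::max(nf - 1, (SIZE)1);
  SIZE gridz = (SIZE)std::ceil((float)total_z / R);
  SIZE gridy = (SIZE)std::ceil((float)total_y / C);
  SIZE gridx = (SIZE)std::ceil((float)total_x / F);
  // Shared memory: the (2R+1)(2C+1)(2F+1) value tile plus ratio_f/c/r.
  size_t sm_size =
      ((R * 2 + 1) * (C * 2 + 1) * (F * 2 + 1) + R * 2 + C * 2 + F * 2) *
      sizeof(T);
  std::printf("grid (%u %u %u), block (%u %u %u), smem %zu bytes\n", gridx,
              gridy, gridz, F, C, R, sm_size);
  return 0;
}
```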
[Deleted _gpk_rev_3d continuation: the setup half of the reverse kernel. The dead code loaded the ratio arrays into shared memory (writing 0.5 into the padded slot when a dimension was even and the block reached the padded boundary), then gathered the coarse nodes from dw back into the even positions of the shared-memory tile, again covering the extra boundary faces, edges, and corner vertex through the same running `base` offsets as the forward kernel.]
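The two deleted kernels are inverses of each other along each parity class: _gpk_reo_3d stores coefficient = value - interpolation, and _gpk_rev_3d restores value = coefficient + interpolation. A minimal round-trip sketch, again assuming lerp(a, b, w) = a + w * (b - a) and using made-up values:

```cpp
#include <cassert>
#include <cmath>
#include <cstdio>

template <typename T> T lerp(T a, T b, T w) { return a + w * (b - a); }

int main() {
  double left = 1.0, right = 3.0, ratio = 0.4; // even-indexed neighbors
  double v = 2.5;                              // value at the odd node

  double coeff = v - lerp(left, right, ratio);          // forward (reo)
  double restored = coeff + lerp(left, right, ratio);   // inverse (rev)
  std::printf("coeff = %f, restored = %f\n", coeff, restored);
  assert(std::fabs(restored - v) < 1e-12); // round-trip up to rounding
  return 0;
}
```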
[Deleted _gpk_rev_3d continuation: the coefficient restores. For each odd-parity class, the dead code read the stored coefficient from the matching array (dwf, dwc, dwr, dwcf, dwrf, dwrc, or dwrcf), added back the corresponding linear, bilinear, or trilinear interpolation of the even-indexed neighbors, and wrote the sum into the shared-memory tile; the same restore logic was then repeated for nodes on the last padded r, c, and f planes and on the domain edges, mirroring the boundary handling of the forward kernel.]
lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)], -// ratio_r_sm[r_sm - 1]); -// // if (c_gl == nc_c-1 && f_gl == nf_c - 1) -// // printf("block: (%d %d %d) thread: (%d %d %d) calc_coeff3 (%d -// // %d %d): %f <- %f %f\n", blockIdx.z, blockIdx.y, blockIdx.x, -// // threadIdx.z, threadIdx.y, threadIdx.x, r_sm, c_sm, f_sm, -// // res, v_sm[get_idx(ldsm1, ldsm2, r_sm - 1, c_sm, -// // f_sm)], -// // v_sm[get_idx(ldsm1, ldsm2, r_sm + 1, c_sm, f_sm)]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } - -// if (r + R * 2 == nr_p - 1 && f + F * 2 == nf_p - 1) { -// if (threadId >= R * C * F * 4 && threadId < R * C * F * 4 + C) { -// if (dwc) { -// r_sm = R * 2; -// c_sm = (threadId - R * C * F * 4) * 2 + 1; -// f_sm = F * 2; -// r_gl = r / 2 + R; -// c_gl = c / 2 + threadId - R * C * F * 4; -// f_gl = f / 2 + F; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc - nc_c && f_gl < nf_c) { -// res = dwc[get_idx(lddwc1, lddwc2, r_gl, c_gl, f_gl)]; -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm - 1, f_sm)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm + 1, f_sm)], -// ratio_c_sm[c_sm - 1]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } - -// if (r + R * 2 == nr_p - 1 && c + C * 2 == nc_p - 1) { -// if (threadId >= R * C * F * 5 && threadId < R * C * F * 5 + F) { -// if (dwf) { -// r_sm = R * 2; -// c_sm = C * 2; -// f_sm = (threadId - R * C * F * 5) * 2 + 1; -// r_gl = r / 2 + R; -// c_gl = c / 2 + C; -// f_gl = f / 2 + threadId - R * C * F * 5; -// if (r_sm < rest_r_p && c_sm < rest_c_p && f_sm < rest_f_p && -// r_gl < nr_c && c_gl < nc_c && f_gl < nf - nf_c) { -// res = dwf[get_idx(lddwf1, lddwf2, r_gl, c_gl, f_gl)]; -// res += lerp(v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm - 1)], -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm + 1)], -// ratio_f_sm[f_sm - 1]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)] = res; -// } -// } -// } -// } - -// // __syncthreads(); -// // if (debug) { -// // printf("TYPE: %d %d %d %d\n", TYPE, -// // min(rest_r_p, R * 2 + 1), -// // min(rest_c_p, C * 2 + 1), -// // min(rest_f_p, F * 2 + 1)); -// // for (int i = 0; i < min(rest_r_p, R * 2 + 1); i++) { -// // for (int j = 0; j < min(rest_c_p, C * 2 + 1); j++) { -// // for (int k = 0; k < min(rest_f_p, F * 2 + 1); k++) { -// // printf("%2.2f ", v_sm[get_idx(ldsm1, ldsm2, i, j, k)]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // } -// // } -// // __syncthreads(); - -// __syncthreads(); - -// r_sm = threadIdx.z; -// c_sm = threadIdx.y; -// f_sm = threadIdx.x; - -// r_sm_ex = blockDim.z; -// c_sm_ex = blockDim.y; -// f_sm_ex = blockDim.x; - -// r_gl = r + r_sm; -// c_gl = c + c_sm; -// f_gl = f + f_sm; - -// // r_gl_ex = r + R * 2; -// // c_gl_ex = c + C * 2; -// // f_gl_ex = f + F * 2; - -// r_gl_ex = r + rest_r - 1; -// c_gl_ex = c + rest_c - 1; -// f_gl_ex = f + rest_f - 1; - -// int unpadding_r = rest_r; -// int unpadding_c = rest_c; -// int unpadding_f = rest_f; -// if (nr % 2 == 0) -// unpadding_r -= 1; -// if (nc % 2 == 0) -// unpadding_c -= 1; -// if (nf % 2 == 0) -// unpadding_f -= 1; - -// if (r_sm < unpadding_r && c_sm < unpadding_c && f_sm < unpadding_f) { - -// // store extra rules -// // case 1: input = odd (non-padding required) -// // case 1.a: block size + 1 == rest (need to store extra); -// // case 1.b: block size + 1 != rest (No need to store extra); -// // case 2: input = even (un-padding requried) -// // 
case 2.a: block size + 1 >= rest (No need to store extra, but need -// // un-padding first); case 2.b: block size + 1 < rest (No need to -// store -// // extra); - -// if (D >= 3 && r_sm == 0) { -// if (nr % 2 != 0 && R * 2 + 1 == rest_r) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm)]; -// } -// if (nr % 2 == 0 && R * 2 + 1 >= rest_r_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm)]; -// // if ( v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, c_sm, f_sm)] == -// 71177117) -// // printf("un-padding0 error block: (%d %d %d) thread: (%d %d %d) -// // un-padding (%d %d %d) %f (%d %d %d)\n", blockIdx.z, blockIdx.y, -// // blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, -// // rest_r-1, c_sm, f_sm, -// // v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, c_sm, f_sm)], -// rest_r_p-1, -// // c_sm, f_sm); -// } -// } - -// if (D >= 2 && c_sm == 0) { -// if (nc % 2 != 0 && C * 2 + 1 == rest_c) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm)]; -// } -// if (nc % 2 == 0 && C * 2 + 1 >= rest_c_p) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm)]; -// // if (v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm)] == -// 71177117) -// // printf("un-padding1 error block: (%d %d %d) thread: (%d %d %d) " -// // "un-padding (%d %d %d) %f (%d %d %d)\n", -// // blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, -// threadIdx.y, -// // threadIdx.x, r_sm, rest_c - 1, f_sm, -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm)], -// r_sm, -// // rest_c_p - 1, f_sm); -// } -// } - -// if (D >= 1 && f_sm == 0) { -// if (nf % 2 != 0 && F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm_ex)]; -// } -// if (nf % 2 == 0 && F * 2 + 1 >= rest_f_p) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f_p - 1)]; -// // if ( v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f_p-1)] == -// 71177117) -// // printf("un-padding2 error block: (%d %d %d) thread: (%d %d %d) -// // un-padding (%d %d %d) %f (%d %d %d)\n", blockIdx.z, blockIdx.y, -// // blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, -// // r_sm, c_sm, rest_f-1, -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, rest_f_p-1)], r_sm, -// c_sm, -// // rest_f_p-1); -// } -// } - -// // load extra edges -// if (D >= 2 && c_sm == 0 && f_sm == 0) { -// if (nc % 2 != 0 && C * 2 + 1 == rest_c && nf % 2 != 0 && -// F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, f_sm_ex)]; -// } -// if (nc % 2 == 0 && nf % 2 == 0 && C * 2 + 1 >= rest_c_p && -// F * 2 + 1 >= rest_f_p) { -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c - 1, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, rest_f_p - 1)]; -// // printf("block: (%d %d %d) thread: (%d %d %d) un-padding (%d %d %d) -// %f -// // (%d %d %d)\n", blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, -// // threadIdx.y, threadIdx.x, r_sm, rest_c-1, rest_f-1, -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c-1, rest_f-1)], r_sm, -// // rest_c_p-1, rest_f_p-1); -// } -// if (nc % 2 == 0 && nf % 2 != 0 && C * 2 + 1 >= rest_c_p && -// F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c_p - 1, f_sm_ex)]; -// } -// if (nc % 2 != 0 && nf 
% 2 == 0 && C * 2 + 1 == rest_c && -// F * 2 + 1 >= rest_f_p) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm_ex, rest_f_p - 1)]; -// // printf("(%d %d %d): %f <- (%d %d %d)\n", -// // r_gl, c_gl_ex, f_gl_ex, -// // dv[get_idx(lddv1, lddv2, r_gl, c_gl_ex, f_gl_ex)], -// // r_sm, c_sm_ex, f_gl_ex); -// } -// } - -// if (D >= 3 && r_sm == 0 && f_sm == 0) { -// if (nr % 2 != 0 && R * 2 + 1 == rest_r && nf % 2 != 0 && -// F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, f_sm_ex)]; -// } -// if (nr % 2 == 0 && nf % 2 == 0 && R * 2 + 1 >= rest_r_p && -// F * 2 + 1 >= rest_f_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, c_sm, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, rest_f_p - 1)]; -// // if ( v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, c_sm, rest_f_p-1)] == -// // 71177117) printf("un-padding3 error block: (%d %d %d) thread: (%d -// %d -// // %d) un-padding (%d %d %d) %f (%d %d %d)\n", blockIdx.z, -// blockIdx.y, -// // blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, -// // rest_r-1, c_sm, rest_f-1, -// // v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, c_sm, rest_f_p-1)], -// // rest_r_p-1, c_sm, rest_f_p-1); -// } -// if (nr % 2 == 0 && nf % 2 != 0 && R * 2 + 1 >= rest_r_p && -// F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm, f_sm_ex)]; -// } -// if (nr % 2 != 0 && nf % 2 == 0 && R * 2 + 1 == rest_r && -// F * 2 + 1 >= rest_f_p) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm, rest_f_p - 1)]; -// // printf("(%d %d %d): %f <- (%d %d %d)\n", -// // r_gl_ex, c_gl, rest_f-1, -// // dv[get_idx(lddv1, lddv2, r_gl_ex-1, c_gl, f_gl_ex)], -// // r_sm_ex, c_sm, rest_f_p-1); -// } -// } - -// if (D >= 3 && r_sm == 0 && c_sm == 0) { -// if (nr % 2 != 0 && R * 2 + 1 == rest_r && nc % 2 != 0 && -// C * 2 + 1 == rest_c) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm)]; -// } -// if (nr % 2 == 0 && nc % 2 == 0 && R * 2 + 1 >= rest_r_p && -// C * 2 + 1 >= rest_c_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, f_sm)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, f_sm)]; -// // if ( v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, rest_c_p-1, f_sm)] == -// // 71177117) printf("un-padding4 error block: (%d %d %d) thread: (%d -// %d -// // %d) un-padding (%d %d %d) %f (%d %d %d)\n", blockIdx.z, -// blockIdx.y, -// // blockIdx.x, threadIdx.z, threadIdx.y, threadIdx.x, -// // rest_r-1, rest_c-1, f_sm, -// // v_sm[get_idx(ldsm1, ldsm2, rest_r_p-1, rest_c_p-1, f_sm)], -// // rest_r_p-1, rest_c_p-1, f_sm); -// } -// if (nr % 2 == 0 && nc % 2 != 0 && R * 2 + 1 >= rest_r_p && -// C * 2 + 1 == rest_c) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm)]; -// } -// if (nr % 2 != 0 && nc % 2 == 0 && R * 2 + 1 == rest_r && -// C * 2 + 1 >= rest_c_p) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm)]; -// } -// } -// // load extra vertex - -// if (D >= 3 && r_sm == 0 && c_sm == 0 && f_sm == 0) { -// if (nr % 2 != 0 && R * 2 + 1 == rest_r && nc % 2 != 0 && -// C * 2 + 1 == rest_c && nf % 2 != 0 && F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, f_sm_ex)]; 
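// Taken together, the boundary branches here implement the store rules
// sketched earlier in this kernel: for a dimension of odd length no
// padding was added, and a block whose full footprint exactly covers the
// remainder (block size * 2 + 1 == rest) additionally writes the last
// "extra" node; for an even length one padded layer was added on load,
// so the value at the padded index rest_p - 1 is copied back to index
// rest - 1 before the final store (un-padding). The same rule is applied
// independently per axis, which is why the edge and vertex cases
// enumerate every odd/even combination of nr, nc, and nf.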
-// } - -// if (nr % 2 == 0 && nc % 2 == 0 && nf % 2 == 0 && R * 2 + 1 >= rest_r_p -// && -// C * 2 + 1 >= rest_c_p && F * 2 + 1 >= rest_f_p) { -// v_sm[get_idx(ldsm1, ldsm2, rest_r - 1, rest_c - 1, rest_f - 1)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, -// rest_f_p - 1)]; - -// // printf("block: (%d %d %d) thread: (%d %d %d) un-padding (%d %d %d) -// %f -// // (%d %d %d)\n", blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, -// // threadIdx.y, threadIdx.x, rest_r-1, rest_c-1, rest_f-1, -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, rest_c-1, rest_f-1)], -// // rest_r_p-1, rest_c_p-1, rest_f_p-1); -// } -// if (nr % 2 == 0 && nc % 2 == 0 && nf % 2 != 0 && R * 2 + 1 >= rest_r_p -// && -// C * 2 + 1 >= rest_c_p && F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, rest_c_p - 1, f_sm_ex)]; -// } -// if (nr % 2 == 0 && nc % 2 != 0 && nf % 2 == 0 && R * 2 + 1 >= rest_r_p -// && -// C * 2 + 1 == rest_c && F * 2 + 1 >= rest_f_p) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, rest_f_p - 1)]; -// } -// if (nr % 2 != 0 && nc % 2 == 0 && nf % 2 == 0 && R * 2 + 1 == rest_r && -// C * 2 + 1 >= rest_c_p && F * 2 + 1 >= rest_f_p) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, rest_f_p - 1)]; -// } -// if (nr % 2 == 0 && nc % 2 != 0 && nf % 2 != 0 && R * 2 + 1 >= rest_r_p -// && -// C * 2 + 1 == rest_c && F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, rest_r_p - 1, c_sm_ex, f_sm_ex)]; -// } -// if (nr % 2 != 0 && nc % 2 == 0 && nf % 2 != 0 && R * 2 + 1 == rest_r && -// C * 2 + 1 >= rest_c_p && F * 2 + 1 == rest_f) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, rest_c_p - 1, f_sm_ex)]; -// } -// if (nr % 2 != 0 && nc % 2 != 0 && nf % 2 == 0 && R * 2 + 1 == rest_r && -// C * 2 + 1 == rest_c && F * 2 + 1 >= rest_f_p) { -// dv[get_idx(lddv1, lddv2, r_gl_ex, c_gl_ex, f_gl_ex)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm_ex, c_sm_ex, rest_f_p - 1)]; -// } -// } -// } - -// __syncthreads(); - -// if (r_sm < rest_r && c_sm < rest_c && f_sm < rest_f) { -// if (r_gl >= svr && r_gl < svr + nvr && c_gl >= svc && c_gl < svc + nvc && -// f_gl >= svf && f_gl < svf + nvf) { -// dv[get_idx(lddv1, lddv2, r_gl, c_gl, f_gl)] = -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)]; - -// // if (c_gl == nc - 1 && f_gl == nf - 1) { -// // printf("block: (%d %d %d) thread: (%d %d %d) store (%d %d %d) %f -// // (%d %d %d)\n", blockIdx.z, blockIdx.y, blockIdx.x, threadIdx.z, -// // threadIdx.y, threadIdx.x, r_gl, c_gl, f_gl, -// // v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm)], r_sm, c_sm, -// f_sm); -// // } -// } -// } -// } - -// template -// void gpk_rev_3d_adaptive_launcher( -// Handle &handle, SIZE nr, SIZE nc, SIZE nf, T *dratio_r, T -// *dratio_c, T *dratio_f, T *dv, SIZE lddv1, SIZE lddv2, T *dw, SIZE lddw1, -// SIZE lddw2, T *dwf, SIZE lddwf1, SIZE lddwf2, T *dwc, SIZE lddwc1, SIZE -// lddwc2, T *dwr, SIZE lddwr1, SIZE lddwr2, T *dwcf, SIZE lddwcf1, SIZE -// lddwcf2, T *dwrf, SIZE lddwrf1, SIZE lddwrf2, T *dwrc, SIZE lddwrc1, SIZE -// lddwrc2, T *dwrcf, SIZE lddwrcf1, SIZE lddwrcf2, SIZE svr, SIZE svc, SIZE -// svf, SIZE nvr, SIZE nvc, SIZE nvf, int queue_idx) { -// cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); -// cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); -// SIZE 
nr_c = nr / 2 + 1; -// SIZE nc_c = nc / 2 + 1; -// SIZE nf_c = nf / 2 + 1; -// SIZE total_thread_z = std::max(nr - 1, (SIZE)1); -// SIZE total_thread_y = std::max(nc - 1, (SIZE)1); -// SIZE total_thread_x = std::max(nf - 1, (SIZE)1); - -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// // tbz = std::min(R, total_thread_z); -// // tby = std::min(C, total_thread_y); -// // tbx = std::min(F, total_thread_x); -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = ((R + 1) * (C + 1) * (F + 1) + R + C + F) * sizeof(T); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); -// // printf("prolongate exec: %d %d %d %d %d %d\n", tbx, tby, tbz, gridx, -// gridy, -// // gridz); -// _gpk_rev_3d -// <<>>( -// nr, nc, nf, nr_c, nc_c, nf_c, dratio_r, dratio_c, dratio_f, dv, -// lddv1, lddv2, dw, lddw1, lddw2, dwf, lddwf1, lddwf2, dwc, lddwc1, -// lddwc2, dwr, lddwr1, lddwr2, dwcf, lddwcf1, lddwcf2, dwrf, lddwrf1, -// lddwrf2, dwrc, lddwrc1, lddwrc2, dwrcf, lddwrcf1, lddwrcf2, svr, -// svc, svf, nvr, nvc, nvf); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void gpk_rev_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf, T *dratio_r, -// T *dratio_c, T *dratio_f, T *dv, SIZE lddv1, SIZE lddv2, T -// *dw, SIZE lddw1, SIZE lddw2, T *dwf, SIZE lddwf1, SIZE -// lddwf2, T *dwc, SIZE lddwc1, SIZE lddwc2, T *dwr, SIZE -// lddwr1, SIZE lddwr2, T *dwcf, SIZE lddwcf1, SIZE lddwcf2, T -// *dwrf, SIZE lddwrf1, SIZE lddwrf2, T *dwrc, SIZE lddwrc1, -// SIZE lddwrc2, T *dwrcf, SIZE lddwrcf1, SIZE lddwrcf2, SIZE -// svr, SIZE svc, SIZE svf, SIZE nvr, SIZE nvc, SIZE nvf, int -// queue_idx, int config) { - -// #define GPK(R, C, F) \ -// { \ -// gpk_rev_3d_adaptive_launcher( \ -// handle, nr, nc, nf, dratio_r, dratio_c, dratio_f, dv, lddv1, lddv2, \ -// dw, lddw1, lddw2, dwf, lddwf1, lddwf2,\ -// dwc, \ -// lddwc1, lddwc2, dwr, lddwr1, lddwr2, dwcf, lddwcf1, lddwcf2,\ -// dwrf, \ -// lddwrf1, lddwrf2, dwrc, lddwrc1, lddwrc2, dwrcf, lddwrcf1, lddwrcf2, \ -// svr, svc, svf, nvr, nvc, nvf, queue_idx); \ -// } -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// GPK(2, 2, 128) -// } -// if (profile || config == 5) { -// GPK(2, 2, 64) -// } -// if (profile || config == 4) { -// GPK(4, 4, 32) -// } -// if (profile || config == 3) { -// GPK(4, 4, 16) -// } -// if (profile || config == 2) { -// GPK(4, 4, 8) -// } -// if (profile || config == 1) { -// GPK(4, 4, 4) -// } -// if (profile || config == 0) { -// GPK(2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// GPK(1, 2, 128) -// } -// if (profile || config == 5) { -// GPK(1, 2, 64) -// } -// if (profile || config == 4) { -// GPK(1, 4, 32) -// } -// if (profile || config == 3) { -// GPK(1, 4, 16) -// } -// if (profile || config == 2) { -// GPK(1, 4, 8) -// } -// if (profile || config == 1) { -// GPK(1, 4, 4) -// } -// if (profile || config == 0) { -// GPK(1, 2, 4) -// } -// } else if (D == 1) { -// if (profile || config == 6) { -// GPK(1, 1, 128) -// } -// if (profile || config == 5) { -// GPK(1, 1, 64) -// } -// if (profile || config == 4) { -// GPK(1, 1, 32) -// } -// if (profile || config == 3) { -// GPK(1, 1, 16) -// } -// if (profile || 
config == 2) { -// GPK(1, 1, 8) -// } -// if (profile || config == 1) { -// GPK(1, 1, 8) -// } -// if (profile || config == 0) { -// GPK(1, 1, 8) -// } -// } -// #undef GPK -// } - } // namespace mgard_x #endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.hpp new file mode 100644 index 0000000000..2cabdfb5c0 --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.hpp @@ -0,0 +1,29 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "../Correction/LevelwiseProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_ADD_ND +#define MGARD_X_DATA_REFACTORING_ADD_ND + +namespace mgard_x { + +template +void AddND(SubArray dinput, + SubArray &doutput, int queue_idx) { + + LwpkReo().Execute(dinput, doutput, queue_idx); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.hpp new file mode 100644 index 0000000000..9783ded972 --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.hpp @@ -0,0 +1,29 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "../Correction/LevelwiseProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_COPY_ND +#define MGARD_X_DATA_REFACTORING_COPY_ND + +namespace mgard_x { + +template +void CopyND(SubArray dinput, + SubArray &doutput, int queue_idx) { + + LwpkReo().Execute(dinput, doutput, queue_idx); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.hpp new file mode 100644 index 0000000000..8187f8b32f --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.hpp @@ -0,0 +1,29 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "../Correction/LevelwiseProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_SUBTRACT_ND +#define MGARD_X_DATA_REFACTORING_SUBTRACT_ND + +namespace mgard_x { + +template +void SubtractND(SubArray dinput, + SubArray &doutput, int queue_idx) { + + LwpkReo().Execute(dinput, doutput, queue_idx); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.hpp new file mode 100644 index 0000000000..97cf131552 --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.hpp @@ -0,0 +1,184 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "IterativeProcessingKernel3D.hpp" +#include "LinearProcessingKernel3D.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_CORRECTION_3D +#define MGARD_X_DATA_REFACTORING_CALC_CORRECTION_3D + +namespace mgard_x { + +template +void CalcCorrection3D(Hierarchy &hierarchy, + SubArray dcoeff, + SubArray &dcorrection, SIZE l, + int queue_idx) { + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + + SubArray dw_in1, dw_in2, dw_out; + + if (D >= 1) { + dw_in1 = dcoeff; + dw_in1.resize( + {hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); + dw_in2 = dcoeff; + dw_in2.offset({hierarchy.dofs[0][l + 1], 0, 0}); + dw_in2.resize({hierarchy.dofs[0][l] - hierarchy.dofs[0][l + 1], + hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); + dw_out = dcorrection; + dw_out.resize( + {hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); + + Lpk1Reo3D().Execute( + hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], + hierarchy.dofs[0][l + 1], hierarchy.dofs[2][l + 1], + hierarchy.dofs[1][l + 1], hierarchy.dofs[0][l + 1], + SubArray(hierarchy.dist_array[0][l]), + SubArray(hierarchy.ratio_array[0][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + "lpk_reo_1_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after mass_trans_multiply_1_cpt", dw_out); + } + } + + if (D >= 2) { + dw_in1 = dw_out; + dw_in1.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l]}); + dw_in2 = dw_out; + dw_in2.offset({0, hierarchy.dofs[1][l + 1], 0}); + dw_in2.resize({hierarchy.dofs[0][l + 1], + hierarchy.dofs[1][l] - hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l]}); + dw_out.offset({hierarchy.dofs[0][l + 1], 0, 0}); + dw_out.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l]}); + + 
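// CalcCorrection3D computes the coarse-grid correction in two phases.
// First, mass-matrix application (Lpk1/Lpk2/Lpk3Reo3D) restricts along
// f, then c, then r: each stage reads the part of the workspace aligned
// with the coarse nodes (dw_in1) together with the remaining coefficient
// part (dw_in2), and writes a result whose current dimension has shrunk
// to the coarse size (dw_out), offset into an unused region of the
// workspace so inputs are never overwritten. Second, the tridiagonal
// solves (Ipk1/Ipk2/Ipk3Reo3D below) invert the 1-D mass matrices along
// each dimension on the coarse grid, yielding the correction returned
// through dcorrection.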
Lpk2Reo3D().Execute( + hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l + 1], + hierarchy.dofs[1][l + 1], SubArray(hierarchy.dist_array[1][l]), + SubArray(hierarchy.ratio_array[1][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + "lpk_reo_2_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after mass_trans_multiply_2_cpt", dw_out); + } + } + + if (D == 3) { + dw_in1 = dw_out; + dw_in1.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l + 1]}); + dw_in2 = dw_out; + dw_in2.offset({0, 0, hierarchy.dofs[2][l + 1]}); + dw_in2.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l] - hierarchy.dofs[2][l + 1]}); + dw_out.offset({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], 0}); + dw_out.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[2][l + 1]}); + + Lpk3Reo3D().Execute( + hierarchy.dofs[2][l], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], hierarchy.dofs[2][l + 1], + SubArray(hierarchy.dist_array[2][l]), + SubArray(hierarchy.ratio_array[2][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + "lpk_reo_3_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after mass_trans_multiply_3_cpt", dw_out); + } + } + + if (D >= 1) { + Ipk1Reo3D().Execute( + hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[0][l + 1]), + SubArray(hierarchy.bm_array[0][l + 1]), + SubArray(hierarchy.dist_array[0][l + 1]), dw_out, queue_idx); + verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + "ipk_1_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after solve_tridiag_1_cpt", dw_out); + } + } + if (D >= 2) { + Ipk2Reo3D().Execute( + hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[1][l + 1]), + SubArray(hierarchy.bm_array[1][l + 1]), + SubArray(hierarchy.dist_array[1][l + 1]), dw_out, queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + "ipk_2_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after solve_tridiag_2_cpt", dw_out); + } + } + if (D == 3) { + Ipk3Reo3D().Execute( + hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[2][l + 1]), + SubArray(hierarchy.bm_array[2][l + 1]), + SubArray(hierarchy.dist_array[2][l + 1]), dw_out, queue_idx); + + verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], + hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), + dw_out.getLd(1), dw_out.getLd(0), + prefix + 
"ipk_3_3d" + "_level_" + std::to_string(l), + multidim_refactoring_store, multidim_refactoring_verify); + + if (multidim_refactoring_debug_print) { + PrintSubarray("after solve_tridiag_3_cpt", dw_out); + } + } + // final correction output + dcorrection = dw_out; +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.hpp new file mode 100644 index 0000000000..0fd807ea9e --- /dev/null +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.hpp @@ -0,0 +1,228 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "IterativeProcessingKernel.hpp" +#include "LinearProcessingKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_CORRECTION_ND +#define MGARD_X_DATA_REFACTORING_CALC_CORRECTION_ND + +namespace mgard_x { + +template +void CalcCorrectionND(Hierarchy &hierarchy, + SubArray dcoeff, + SubArray &dcorrection, SIZE l, + int queue_idx) { + + std::string prefix = "decomp_"; + if (sizeof(T) == sizeof(double)) + prefix += "d_"; + if (sizeof(T) == sizeof(float)) + prefix += "f_"; + for (int d = 0; d < D; d++) + prefix += std::to_string(hierarchy.shape[d]) + "_"; + + SubArray dw_in1 = dcoeff; + SubArray dw_in2 = dcoeff; + SubArray dw_out = dcorrection; + + // start correction calculation + int prev_dim_r, prev_dim_c, prev_dim_f; + int curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + + dw_in1.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); + dw_in2.offset(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); + dw_in2.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l] - + hierarchy.dofs[curr_dim_f][l + 1]); + dw_out.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); + + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + + Lpk1Reo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.processed_n[0], + SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[0], true), + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.dist_array[curr_dim_f][l]), + SubArray(hierarchy.ratio_array[curr_dim_f][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after MR-1D[{}]", l), dw_out); + } + + // mass trans 2D + prev_dim_f = curr_dim_f; + prev_dim_c = curr_dim_c; + prev_dim_r = curr_dim_r; + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + + dw_in1 = dw_out; + dw_in2 = dw_out; + dw_in1.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); + dw_in2.offset(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); + dw_in2.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l] - + hierarchy.dofs[curr_dim_c][l + 1]); + dw_out.offset(prev_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); + dw_out.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); + + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + + Lpk2Reo().Execute( + SubArray<1, SIZE, 
DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.processed_n[1], + SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[1], true), + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.dist_array[curr_dim_c][l]), + SubArray(hierarchy.ratio_array[curr_dim_c][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after MR-2D[{}]", l), dw_out); + } + + // mass trans 3D + + prev_dim_f = curr_dim_f; + prev_dim_c = curr_dim_c; + prev_dim_r = curr_dim_r; + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + + dw_in1 = dw_out; + dw_in2 = dw_out; + dw_in1.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + dw_in2.offset(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + dw_in2.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l] - + hierarchy.dofs[curr_dim_r][l + 1]); + dw_out.offset(prev_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); + dw_out.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + + Lpk3Reo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.processed_n[2], + SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[2], true), + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.dist_array[curr_dim_r][l]), + SubArray(hierarchy.ratio_array[curr_dim_r][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after MR-3D[{}]", l), dw_out); + } + + // mass trans 4D+ + for (int i = 3; i < D; i++) { + prev_dim_f = curr_dim_f; + prev_dim_c = curr_dim_c; + prev_dim_r = curr_dim_r; + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = i; + dw_in1 = dw_out; + dw_in2 = dw_out; + dw_in1.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + dw_in2.offset(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + dw_in2.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l] - + hierarchy.dofs[curr_dim_r][l + 1]); + dw_out.offset(prev_dim_r, hierarchy.dofs[prev_dim_r][l + 1]); + dw_out.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); + + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + Lpk3Reo().Execute( + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), + SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), + hierarchy.processed_n[i], + SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[i], true), + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.dist_array[curr_dim_r][l]), + SubArray(hierarchy.ratio_array[curr_dim_r][l]), dw_in1, dw_in2, dw_out, + queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after MR-{}D[{}]", i + 1, l), + dw_out); + } + } + + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk1Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_f][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_f][l + 1]), dw_out, queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: 
after TR-1D[{}]", l), dw_out); + } // debug + + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk2Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_c][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_c][l + 1]), dw_out, queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after TR-2D[{}]", l), dw_out); + } // debug + + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk3Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_r][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), dw_out, queue_idx); + + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after TR-3D[{}]", l), dw_out); + } // debug + + // mass trans 4D+ + for (int i = 3; i < D; i++) { + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = i; + dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); + dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk3Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_r][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), dw_out, queue_idx); + if (multidim_refactoring_debug_print) { // debug + PrintSubarray4D(format("decomposition: after TR-{}D[{}]", i + 1, l), + dw_out); + } // debug + } + + dcorrection = dw_out; +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.h deleted file mode 100644 index e388e184b3..0000000000 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.h +++ /dev/null @@ -1,37 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_ITERATIVE_PROCESSING_KERNEL -#define MGARD_X_ITERATIVE_PROCESSING_KERNEL - -#include "../../Common.h" - -namespace mgard_x { - -template -void ipk_1(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE *shape_d, - SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, - DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, - DIM curr_dim_c, DIM curr_dim_f, T *am, T *bm, T *ddist_f, T *dv, - LENGTH lddv1, LENGTH lddv2, int queue_idx, int config); - -template -void ipk_2(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE *shape_d, - SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, - DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, - DIM curr_dim_c, DIM curr_dim_f, T *am, T *bm, T *ddist_c, T *dv, - LENGTH lddv1, LENGTH lddv2, int queue_idx, int config); - -template -void ipk_3(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE *shape_d, - SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, - DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, - DIM curr_dim_c, DIM curr_dim_f, T *am, T *bm, T *ddist_r, T *dv, - LENGTH lddv1, LENGTH lddv2, int queue_idx, int config); -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.hpp index 1d07134ee8..cb2c4a3029 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel.hpp @@ -618,14 +618,11 @@ class Ipk1Reo : public AutoTuner { SubArray<1, T, DeviceType> am, SubArray<1, T, DeviceType> bm, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(v.getShape(curr_dim_f)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts1[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk1_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -638,22 +635,26 @@ class Ipk1Reo : public AutoTuner { TaskType task = GenTask(curr_dim_r, curr_dim_c, curr_dim_f, \ am, bm, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk1Reo.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { @@ -1320,14 +1321,11 @@ class Ipk2Reo : public AutoTuner { SubArray<1, T, DeviceType> am, SubArray<1, T, DeviceType> bm, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(v.getShape(curr_dim_f)) - 
1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts2[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk2_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -1340,22 +1338,26 @@ class Ipk2Reo : public AutoTuner { TaskType task = GenTask(curr_dim_r, curr_dim_c, curr_dim_f, \ am, bm, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk2Reo.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { @@ -2047,14 +2049,11 @@ class Ipk3Reo : public AutoTuner { SubArray<1, T, DeviceType> am, SubArray<1, T, DeviceType> bm, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(v.getShape(curr_dim_f)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts3[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk3_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -2067,22 +2066,26 @@ class Ipk3Reo : public AutoTuner { TaskType task = GenTask(curr_dim_r, curr_dim_c, curr_dim_f, \ am, bm, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk3Reo.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.h deleted file mode 100644 index bb2a31552f..0000000000 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.h +++ /dev/null @@ -1,32 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D -#define MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D - -#include "../../Common.h" - -namespace mgard_x { - -template -void ipk_1_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf_c, T *am, T *bm, - T *ddist_f, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, - int config); - -template -void ipk_2_3d(Handle &handle, SIZE nr, SIZE nc_c, SIZE nf_c, T *am, T *bm, - T *ddist_c, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, - int config); - -template -void ipk_3_3d(Handle &handle, SIZE nr_c, SIZE nc_c, SIZE nf_c, T *am, - T *bm, T *ddist_r, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, - int config); - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.hpp index 655aa451bb..259a03f575 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D.hpp @@ -401,14 +401,11 @@ class Ipk1Reo3D : public AutoTuner { SubArray<1, T, DeviceType> bm, SubArray<1, T, DeviceType> dist_f, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts1[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk1_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -421,22 +418,26 @@ class Ipk1Reo3D : public AutoTuner { TaskType task = \ GenTask(nr, nc, nf, am, bm, dist_f, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk1Reo3D.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { @@ -810,14 +811,11 @@ class Ipk2Reo3D : public AutoTuner { SubArray<1, T, DeviceType> bm, SubArray<1, T, DeviceType> dist_c, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts2[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk2_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -830,22 +828,26 @@ class Ipk2Reo3D : public AutoTuner { TaskType task = \ GenTask(nr, nc, nf, am, bm, dist_c, v, queue_idx); \ 
DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk2Reo3D.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { @@ -1219,14 +1221,11 @@ class Ipk3Reo3D : public AutoTuner { SubArray<1, T, DeviceType> bm, SubArray<1, T, DeviceType> dist_r, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_ts3[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.ipk3_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define IPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -1239,22 +1238,26 @@ class Ipk3Reo3D : public AutoTuner { TaskType task = \ GenTask(nr, nc, nf, am, bm, dist_r, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - IPK(0) - IPK(1) - IPK(2) - IPK(3) - IPK(4) - IPK(5) - IPK(6) + IPK(6) if (!ret.success) config--; + IPK(5) if (!ret.success) config--; + IPK(4) if (!ret.success) config--; + IPK(3) if (!ret.success) config--; + IPK(2) if (!ret.success) config--; + IPK(1) if (!ret.success) config--; + IPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Ipk3Reo3D.\n"; + exit(-1); + } #undef IPK if (AutoTuner::ProfileKernels) { @@ -1263,1162 +1266,6 @@ class Ipk3Reo3D : public AutoTuner { } }; -// template -// __global__ void _ipk_1_3d(SIZE nr, SIZE nc, SIZE nf_c, T *am, T *bm, T -// *dist_f, -// T *v, SIZE ldv1, SIZE ldv2) { - -// SIZE c_gl = blockIdx.x * C; -// SIZE r_gl = blockIdx.y * R; -// SIZE f_gl = threadIdx.x; - -// SIZE c_sm = threadIdx.x; -// SIZE r_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// T *vec = v + get_idx(ldv1, ldv2, r_gl, c_gl, 0); -// T *sm = SharedMemory(); -// SIZE ldsm1 = F + G; -// SIZE ldsm2 = C; -// T *vec_sm = sm; -// T *am_sm = sm + R * ldsm1 * ldsm2; -// T *bm_sm = am_sm + ldsm1; - -// T prev_vec_sm = 0.0; - -// SIZE c_rest = min(C, nc - blockIdx.x * C); -// SIZE r_rest = min(R, nr - blockIdx.y * R); - -// // printf("r_rest: %u, c_rest: %u\n", r_rest, c_rest); -// // printf("RCF: %u %u %u\n", R,C,F); -// // printf("n: %u %u %u\n", nr, nc, nf_c); - -// SIZE f_rest = nf_c; -// SIZE f_ghost = min(nf_c, G); -// // printf("G%u, f_ghost:%u\n ", G, f_ghost); -// SIZE f_main = F; - -// /* Load first ghost */ -// if (r_sm < r_rest && f_sm < f_ghost) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)]; -// } -// if (r_sm 
== 0) { -// am_sm[f_sm] = am[f_gl]; -// bm_sm[f_sm] = bm[f_gl]; -// // printf("am[%u]: %f, bm[%u]: %f\n", f_sm, f_sm, am_sm[f_sm], -// bm_sm[f_sm]); -// } -// } - -// f_rest -= f_ghost; -// __syncthreads(); - -// while (f_rest > F - f_ghost) { -// f_main = min(F, f_rest); -// if (r_sm < r_rest && f_sm < f_main) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, f_gl + f_ghost)]; -// } -// if (r_sm == 0) { -// am_sm[f_sm + f_ghost] = am[f_gl + f_ghost]; -// bm_sm[f_sm + f_ghost] = bm[f_gl + f_ghost]; -// } -// } - -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// c_sm, 0)]); - -// //#pragma unroll 32 -// for (SIZE i = 1; i < F; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); -// } - -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, F - 1)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < F) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* Update unloaded col */ -// f_rest -= f_main; - -// /* Advance c */ -// f_gl += F; - -// /* Copy next ghost to main */ -// f_ghost = min(G, f_main - (F - G)); -// if (r_sm < r_rest && f_sm < f_ghost) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + F)]; -// } -// if (r_sm == 0) { -// am_sm[f_sm] = am_sm[f_sm + blockDim.x]; -// bm_sm[f_sm] = bm_sm[f_sm + blockDim.x]; -// } -// } -// __syncthreads(); -// } // end of while - -// /* Load all rest col */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, f_gl + f_ghost)]; -// } -// if (r_sm == 0) { -// am_sm[f_sm + f_ghost] = am[f_gl + f_ghost]; -// bm_sm[f_sm + f_ghost] = bm[f_gl + f_ghost]; -// } -// } - -// __syncthreads(); - -// /* Only 1 col remain */ -// if (f_ghost + f_rest == 1) { -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// c_sm, 0)]); -// } -// //__syncthreads(); - -// } else { -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// c_sm, 0)]); -// for (SIZE i = 1; i < f_ghost + f_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); -// } -// } -// } -// __syncthreads(); -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_ghost + f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* backward */ -// f_rest = nf_c; -// f_ghost = min(nf_c, G); -// f_main = F; -// f_gl = threadIdx.x; -// 
prev_vec_sm = 0.0; - -// /* Load first ghost */ -// if (r_sm < r_rest && f_sm < f_ghost) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)]; -// } -// } -// if (r_sm == 0 && f_gl <= nf_c) { -// am_sm[f_sm] = am[nf_c - f_gl]; -// bm_sm[f_sm] = bm[nf_c - f_gl]; // * -1; -// } -// f_rest -= f_ghost; -// __syncthreads(); - -// while (f_rest > F - f_ghost) { -// f_main = min(F, f_rest); -// if (r_sm < r_rest && f_sm < f_main) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl - f_ghost)]; -// } -// } -// if (r_sm == 0 && f_gl + f_ghost <= nf_c) { -// am_sm[f_sm + f_ghost] = am[nf_c - f_gl - f_ghost]; -// bm_sm[f_sm + f_ghost] = bm[nf_c - f_gl - f_ghost]; // * -1; -// } -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); -// //#pragma unroll 32 -// for (SIZE i = 1; i < F; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, -// i)]); -// } -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, blockDim.x - -// 1)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < F) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* Update unloaded col */ -// f_rest -= f_main; - -// /* Advance c */ -// f_gl += F; - -// /* Copy next ghost to main */ -// f_ghost = min(G, f_main - (F - G)); -// if (r_sm < r_rest && f_sm < f_ghost) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + F)]; -// } -// if (r_sm == 0) { -// am_sm[f_sm] = am_sm[f_sm + F]; -// bm_sm[f_sm] = bm_sm[f_sm + F]; -// } -// } -// __syncthreads(); -// } // end of while - -// /* Load all rest col */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = -// vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl - f_ghost)]; -// } -// } -// if (r_sm == 0 && f_gl + f_ghost <= nf_c) { -// am_sm[f_sm + f_ghost] = am[nf_c - f_gl - f_ghost]; -// bm_sm[f_sm + f_ghost] = bm[nf_c - f_gl - f_ghost]; -// // printf("%u %u\n", f_gl, f_ghost); -// } -// __syncthreads(); - -// /* Only 1 col remain */ -// if (f_ghost + f_rest == 1) { -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); -// } -// //__syncthreads(); - -// } else { -// if (r_sm < r_rest && c_sm < c_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); -// for (SIZE i = 1; i < f_ghost + f_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, -// i)]); -// } -// } -// } -// 
__syncthreads(); -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_ghost + f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); -// } - -// template -// void ipk_1_3d_adaptive_launcher(Handle &handle, SIZE nr, SIZE nc, SIZE -// nf_c, -// T *am, T *bm, T *ddist_f, T *dv, SIZE lddv1, -// SIZE lddv2, int queue_idx) { -// // std::cout << "test\n"; - -// SIZE total_thread_x = nc; -// SIZE total_thread_y = nr; -// SIZE total_thread_z = 1; -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// SIZE sm_size; - -// tbx = C;//std::max(C, std::min(C, total_thread_x)); -// tby = R;//std::max(R, std::min(R, total_thread_y)); -// tbz = 1; -// sm_size = (R * C + 2) * (F + G) * sizeof(T); -// gridx = ceil((float)total_thread_x / tbx); -// gridy = ceil((float)total_thread_y / tby); -// gridz = 1; -// threadsPerBlock = dim3(F, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); - -// _ipk_1_3d<<>>( -// nr, nc, nf_c, am, bm, ddist_f, dv, lddv1, lddv2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// // std::cout << "test\n"; -// } - -// template -// void ipk_1_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf_c, T *am, T -// *bm, -// T *ddist_f, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, -// int config) { - -// #define IPK(R, C, F, G) \ -// { \ -// ipk_1_3d_adaptive_launcher( \ -// handle, nr, nc, nf_c, am, bm, ddist_f, dv, lddv1, lddv2, queue_idx); \ -// } -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// IPK(2, 2, 128, 2) -// } -// if (profile || config == 5) { -// IPK(2, 2, 64, 2) -// } -// if (profile || config == 4) { -// IPK(2, 2, 32, 2) -// } -// if (profile || config == 3) { -// IPK(4, 4, 16, 4) -// } -// if (profile || config == 2) { -// IPK(8, 8, 8, 4) -// } -// if (profile || config == 1) { -// IPK(4, 4, 4, 4) -// } -// if (profile || config == 0) { -// IPK(2, 2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// IPK(1, 2, 128, 2) -// } -// if (profile || config == 5) { -// IPK(1, 2, 64, 2) -// } -// if (profile || config == 4) { -// IPK(1, 2, 32, 2) -// } -// if (profile || config == 3) { -// IPK(1, 4, 16, 4) -// } -// if (profile || config == 2) { -// IPK(1, 8, 8, 4) -// } -// if (profile || config == 1) { -// IPK(1, 4, 4, 4) -// } -// if (profile || config == 0) { -// IPK(1, 2, 4, 2) -// } -// } else if (D == 1) { -// if (profile || config == 6) { -// IPK(1, 1, 128, 2) -// } -// if (profile || config == 5) { -// IPK(1, 1, 64, 2) -// } -// if (profile || config == 4) { -// IPK(1, 1, 32, 2) -// } -// if (profile || config == 3) { -// IPK(1, 1, 16, 4) -// } -// if (profile || config == 2) { -// IPK(1, 1, 8, 4) -// } -// if (profile || config == 1) { -// IPK(1, 1, 8, 4) -// } -// if (profile || config == 0) { -// IPK(1, 1, 8, 2) -// } -// } -// #undef IPK -// } - -// template -// __global__ void _ipk_2_3d(SIZE nr, SIZE nc_c, SIZE nf_c, T *am, T *bm, T -// *dist_c, -// T *v, SIZE ldv1, SIZE ldv2) { - -// SIZE f_gl = blockIdx.x * F; -// SIZE r_gl = blockIdx.y * R; -// SIZE c_gl = 0; - -// SIZE f_sm = threadIdx.x; -// SIZE r_sm = threadIdx.y; -// SIZE c_sm = threadIdx.x; - -// T *vec = v + get_idx(ldv1, ldv2, r_gl, 0, f_gl); -// T *sm = SharedMemory(); -// SIZE ldsm1 = F; -// SIZE ldsm2 = C + G; -// T *vec_sm = sm; -// T 
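For context on the deleted kernels: each one streams a full line of the volume through a fixed shared-memory window, loading G "ghost" entries, then repeatedly loading F main entries, computing, flushing, and recycling the last G entries as the head of the next window. A serial 1-D sketch of that window management (self-contained; F and G play the same roles as in the kernels):

// Host-side sketch of the F-main / G-ghost sliding window used by the
// deleted _ipk_*_3d kernels (1-D, serial; compute step elided).
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
  const int F = 8, G = 2, n = 29;
  std::vector<double> v(n, 1.0), buf(F + G);
  int rest = n, gl = 0;
  int ghost = std::min(n, G);
  std::copy(v.begin(), v.begin() + ghost, buf.begin()); // first ghost load
  rest -= ghost;
  while (rest > F - ghost) {
    int main_n = std::min(F, rest);
    std::copy(v.begin() + gl + ghost, v.begin() + gl + ghost + main_n,
              buf.begin() + ghost); // load main part behind the ghost
    // ... compute on buf[0 .. F) here ...
    std::copy(buf.begin(), buf.begin() + F, v.begin() + gl); // flush results
    rest -= main_n;
    gl += F;
    ghost = std::min(G, main_n - (F - G)); // next ghost size
    std::copy(buf.begin() + F, buf.begin() + F + ghost,
              buf.begin()); // recycle tail as next window's head
  }
  // tail: load the remaining `rest` entries after the ghost and finish
  std::printf("processed %d of %d entries in the main loop\n", gl, n);
  return 0;
}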
*am_sm = sm + R * ldsm1 * ldsm2; -// T *bm_sm = am_sm + ldsm2; - -// T prev_vec_sm = 0.0; - -// SIZE f_rest = min(F, nf_c - blockIdx.x * F); -// SIZE r_rest = min(R, nr - blockIdx.y * R); - -// // if (blockIdx.x == 1 && blockIdx.y == 0 && f_sm == 0 && r_sm == 0) { -// // prSIZEf("f_rest: %d r_rest: %d\n", f_rest, r_rest); -// // } - -// SIZE c_rest = nc_c; -// SIZE c_ghost = min(nc_c, G); -// SIZE c_main = C; - -// /* Load first ghost */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)]; -// // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", -// r0_stride, -// // i, vec_sm[i * ldsm + c_sm]); -// } -// } -// if (r_sm == 0 && c_sm < c_ghost) { -// am_sm[c_sm] = am[c_gl + c_sm]; -// bm_sm[c_sm] = bm[c_gl + c_sm]; -// } -// c_rest -= c_ghost; -// __syncthreads(); - -// while (c_rest > C - c_ghost) { -// // printf("%d %d %d\n", c_rest, C, c_ghost); -// c_main = min(C, c_rest); -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_main; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, c_gl + i + c_ghost, f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_main){ -// am_sm[c_sm + c_ghost] = am[c_gl + c_sm + c_ghost]; -// bm_sm[c_sm + c_ghost] = bm[c_gl + c_sm + c_ghost]; -// } -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (r_sm < r_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// 0, f_sm)]); - -// for (SIZE i = 1; i < C; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); -// } -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, C - 1, f_sm)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < C; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* Update unloaded col */ -// c_rest -= c_main; - -// /* Advance c */ -// c_gl += C; - -// /* Copy next ghost to main */ -// c_ghost = min(G, c_main - (C - G)); -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + C, f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_ghost) { -// am_sm[c_sm] = am_sm[c_sm + C]; -// bm_sm[c_sm] = bm_sm[c_sm + C]; -// } -// __syncthreads(); - -// } // end of while - -// /* Load all rest col */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, c_gl + i + c_ghost, f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_rest) { -// am_sm[c_sm + c_ghost] = am[c_gl + c_sm + c_ghost]; -// bm_sm[c_sm + c_ghost] = bm[c_gl + c_sm + c_ghost]; -// } -// __syncthreads(); - -// /* Only 1 col remain */ -// if (c_ghost + c_rest == 1) { -// if (r_sm < r_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// 0, f_sm)]); -// } -// //__syncthreads(); - -// } else { -// if (r_sm < r_rest && f_sm < f_rest) 
{ -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, -// 0, f_sm)]); -// for (SIZE i = 1; i < c_ghost + c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); -// } -// } -// } -// __syncthreads(); -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost + c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* backward */ -// c_rest = nc_c; -// c_ghost = min(nc_c, G); -// c_main = C; -// c_gl = 0; -// prev_vec_sm = 0.0; - -// /* Load first ghost */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_ghost) { -// am_sm[c_sm] = am[nc_c - (c_gl + c_sm)]; -// bm_sm[c_sm] = bm[nc_c - (c_gl + c_sm)]; -// } -// c_rest -= c_ghost; -// __syncthreads(); - -// while (c_rest > C - c_ghost) { -// // printf("%d %d %d\n", c_rest, C, c_ghost); -// c_main = min(C, c_rest); -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_main; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = vec[get_idx( -// ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i + c_ghost), f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_main) { -// am_sm[c_sm + c_ghost] = am[nc_c- (c_gl + c_sm + c_ghost)]; -// bm_sm[c_sm + c_ghost] = bm[nc_c- (c_gl + c_sm + c_ghost)]; -// } -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (r_sm < r_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)]); - -// for (SIZE i = 1; i < C; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, -// f_sm)]); -// } - -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, C - 1, f_sm)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < C; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// } -// } -// __syncthreads(); - -// /* Update unloaded col */ -// c_rest -= c_main; - -// /* Advance c */ -// c_gl += C; - -// /* Copy next ghost to main */ -// c_ghost = min(G, c_main - (C - G)); -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + C, f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_ghost) { -// am_sm[c_sm] = am_sm[c_sm + C]; -// bm_sm[c_sm] = bm_sm[c_sm + C]; -// } -// __syncthreads(); - -// } // end of while - -// // Load all rest col -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = vec[get_idx( -// ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i + c_ghost), f_sm)]; -// } -// } -// if (r_sm == 0 && c_sm < c_rest) { -// am_sm[c_sm + c_ghost] = am[nc_c - (c_gl + c_sm + c_ghost)]; -// bm_sm[c_sm + c_ghost] = 
bm[nc_c - (c_gl + c_sm + c_ghost)]; -// } -// __syncthreads(); - -// /* Only 1 col remain */ -// if (c_ghost + c_rest == 1) { -// if (r_sm < r_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)]); -// } -// //__syncthreads(); - -// } else { -// if (r_sm < r_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)]); -// for (SIZE i = 1; i < c_ghost + c_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, -// f_sm)]); -// } -// } -// } -// __syncthreads(); -// /* flush results to v */ -// if (r_sm < r_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < c_ghost + c_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; -// // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = -// // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv -// + -// // c_stride, vec[i * row_stride * lddv + c_stride]); -// } -// } -// __syncthreads(); -// } - -// template -// void ipk_2_3d_adaptive_launcher(Handle &handle, SIZE nr, SIZE nc_c, -// SIZE nf_c, T *am, T *bm, T *ddist_c, T *dv, -// SIZE lddv1, SIZE lddv2, int queue_idx) { -// SIZE total_thread_x = nf_c; -// SIZE total_thread_y = nr; -// SIZE total_thread_z = 1; -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbx = F;//std::max(F, std::min(F, total_thread_x)); -// tby = R;//std::max(R, std::min(R, total_thread_y)); -// tbz = 1; -// sm_size = (R * F + 2) * (C + G) * sizeof(T); -// gridx = ceil((float)total_thread_x / tbx); -// gridy = ceil((float)total_thread_y / tby); -// gridz = 1; -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); -// _ipk_2_3d<<>>( -// nr, nc_c, nf_c, am, bm, ddist_c, dv, lddv1, lddv2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void ipk_2_3d(Handle &handle, SIZE nr, SIZE nc_c, SIZE nf_c, T *am, T -// *bm, -// T *ddist_c, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, -// int config) { - -// #define IPK(R, C, F, G) \ -// { \ -// ipk_2_3d_adaptive_launcher( \ -// handle, nr, nc_c, nf_c, am, bm, ddist_c, dv, lddv1, lddv2, queue_idx); \ -// } -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// IPK(2, 2, 128, 2) -// } -// if (profile || config == 5) { -// IPK(2, 2, 64, 2) -// } -// if (profile || config == 4) { -// IPK(2, 2, 32, 2) -// } -// if (profile || config == 3) { -// IPK(4, 4, 16, 4) -// } -// if (profile || config == 2) { -// IPK(8, 8, 8, 4) -// } -// if (profile || config == 1) { -// IPK(4, 4, 4, 4) -// } -// if (profile || config == 0) { -// IPK(2, 2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// IPK(1, 2, 128, 2) -// } -// if (profile || config == 5) { -// IPK(1, 2, 64, 2) -// } -// if (profile || config == 4) { -// IPK(1, 2, 32, 2) -// } -// if (profile || config == 3) { -// IPK(1, 4, 16, 4) -// } -// if (profile || config == 2) { -// IPK(1, 8, 8, 4) -// } -// if (profile || config == 1) { -// IPK(1, 4, 4, 4) -// } -// if 
(profile || config == 0) { -// IPK(1, 2, 4, 2) -// } -// } else { -// printf("Error: ipk_2_3d is only for 3D and 2D data\n"); -// } -// #undef IPK -// } - -// template -// __global__ void _ipk_3_3d(SIZE nr_c, SIZE nc_c, SIZE nf_c, T *am, T *bm, T -// *dist_r, -// T *v, SIZE ldv1, SIZE ldv2) { - -// SIZE f_gl = blockIdx.x * F; -// SIZE c_gl = blockIdx.y * C; -// SIZE r_gl = 0; - -// SIZE f_sm = threadIdx.x; -// SIZE c_sm = threadIdx.y; -// SIZE r_sm = threadIdx.x; - -// T *vec = v + get_idx(ldv1, ldv2, 0, c_gl, f_gl); -// T *sm = SharedMemory(); -// SIZE ldsm1 = F; -// SIZE ldsm2 = C; -// T *vec_sm = sm; -// T *am_sm = sm + (R + G) * ldsm1 * ldsm2; -// T *bm_sm = am_sm + (R + G); - -// T prev_vec_sm = 0.0; - -// SIZE f_rest = min(F, nf_c - blockIdx.x * F); -// SIZE c_rest = min(C, nc_c - blockIdx.y * C); - -// SIZE r_rest = nr_c; -// SIZE r_ghost = min(nr_c, G); -// SIZE r_main = R; - -// /* Load first ghost */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)]; -// } -// } - -// if (c_sm == 0 && r_sm < r_ghost) { -// am_sm[r_sm] = am[r_gl + r_sm]; -// bm_sm[r_sm] = bm[r_gl + r_sm]; -// } -// r_rest -= r_ghost; -// __syncthreads(); - -// while (r_rest > R - r_ghost) { -// r_main = min(R, r_rest); -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_main; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_gl + i + r_ghost, c_sm, f_sm)]; -// } -// } -// if (c_sm == 0 && r_sm < r_main) { -// am_sm[r_sm + r_ghost] = am[r_gl + r_sm + r_ghost]; -// bm_sm[r_sm + r_ghost] = bm[r_gl + r_sm + r_ghost]; -// } -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, -// c_sm, f_sm)]); -// for (SIZE i = 1; i < R; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); -// } - -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, R - 1, c_sm, f_sm)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < R; i++) { -// vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; -// } -// } -// __syncthreads(); - -// // /* Update unloaded col */ -// r_rest -= r_main; - -// /* Advance c */ -// r_gl += R; - -// /* Copy next ghost to main */ -// r_ghost = min(G, r_main - (R - G)); -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i + R, c_sm, f_sm)]; -// } -// } -// if (c_sm == 0 && r_sm < r_ghost) { -// am_sm[r_sm] = am_sm[r_sm + R]; -// bm_sm[r_sm] = bm_sm[r_sm + R]; -// } -// __syncthreads(); - -// } // end of while - -// /* Load all rest col */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = -// vec[get_idx(ldv1, ldv2, r_gl + i + r_ghost, c_sm, f_sm)]; -// } -// } - -// if (c_sm == 0 && r_sm < r_rest) { -// am_sm[r_sm + r_ghost] = am[r_gl + r_sm + r_ghost]; -// bm_sm[r_sm + r_ghost] = bm[r_gl + r_sm + r_ghost]; -// } -// __syncthreads(); - -// /* Only 1 col 
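The `config` integer selects compile-time tile shapes: each deleted launcher expands an `IPK(R, C, F, G)` macro once per shape and guards it with `profile || config == k`. The same dispatch can be sketched with a template and a switch (3-D shapes taken from the table above; `launch_tile` is a hypothetical stand-in for the adaptive launcher):

// Sketch of the runtime-config -> compile-time tile dispatch that the
// deleted ipk_*_3d launchers implement with the IPK(R, C, F, G) macro.
#include <cstdio>

template <int R, int C, int F, int G>
void launch_tile() { // stand-in for the templated kernel launch
  std::printf("launch R=%d C=%d F=%d G=%d\n", R, C, F, G);
}

void launch(int config) { // 3-D shapes from the ipk_1/ipk_2 table above
  switch (config) {
  case 6: launch_tile<2, 2, 128, 2>(); break;
  case 5: launch_tile<2, 2, 64, 2>(); break;
  case 4: launch_tile<2, 2, 32, 2>(); break;
  case 3: launch_tile<4, 4, 16, 4>(); break;
  case 2: launch_tile<8, 8, 8, 4>(); break;
  case 1: launch_tile<4, 4, 4, 4>(); break;
  default: launch_tile<2, 2, 2, 2>(); break;
  }
}

int main() { launch(3); return 0; }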
remain */ -// if (r_ghost + r_rest == 1) { -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, -// c_sm, f_sm)]); -// } -// //__syncthreads(); - -// } else { -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward2( -// prev_vec_sm, am_sm[0], bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, -// c_sm, f_sm)]); -// for (SIZE i = 1; i < r_ghost + r_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_forward2( -// vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], am_sm[i], -// bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); -// } -// } -// } -// __syncthreads(); -// /* flush results to v */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost + r_rest; i++) { -// vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; -// } -// } -// __syncthreads(); - -// /* backward */ -// r_rest = nr_c; -// r_ghost = min(nr_c, G); -// r_main = R; -// r_gl = 0; -// prev_vec_sm = 0.0; - -// /* Load first ghost */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = -// vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)]; -// } -// } - -// if (c_sm == 0 && r_sm < r_ghost) { -// am_sm[r_sm] = am[nr_c - (r_gl + r_sm)]; -// bm_sm[r_sm] = bm[nr_c - (r_gl + r_sm)]; -// } -// r_rest -= r_ghost; -// __syncthreads(); - -// while (r_rest > R - r_ghost) { -// r_main = min(R, r_rest); -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_main; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = vec[get_idx( -// ldv1, ldv2, (nr_c - 1) - (r_gl + i + r_ghost), c_sm, f_sm)]; -// } -// } -// if (c_sm == 0 && r_sm < r_main) { -// am_sm[r_sm + r_ghost] = am[nr_c - (r_gl + r_sm + r_ghost)]; -// bm_sm[r_sm + r_ghost] = bm[nr_c - (r_gl + r_sm + r_ghost)]; -// } -// __syncthreads(); - -// /* Computation of v in parallel*/ -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); -// for (SIZE i = 1; i < R; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, -// f_sm)]); -// } - -// /* Store last v */ -// prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, R - 1, c_sm, f_sm)]; -// } -// __syncthreads(); - -// /* flush results to v */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < R; i++) { -// vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; -// } -// } -// __syncthreads(); - -// // /* Update unloaded col */ -// r_rest -= r_main; - -// /* Advance c */ -// r_gl += R; - -// /* Copy next ghost to main */ -// r_ghost = min(G, r_main - (R - G)); -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i + R, c_sm, f_sm)]; -// } -// } -// if (c_sm == 0 && r_sm < r_ghost) { -// am_sm[r_sm] = am_sm[r_sm + R]; -// bm_sm[r_sm] = bm_sm[r_sm + R]; -// } -// __syncthreads(); - -// } // end of while - -// /* Load all rest col */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_rest; 
i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = vec[get_idx( -// ldv1, ldv2, (nr_c - 1) - (r_gl + i + r_ghost), c_sm, f_sm)]; -// } -// } -// if (c_sm == 0 && r_sm < r_rest) { -// am_sm[r_sm + r_ghost] = am[nr_c - (r_gl + r_sm + r_ghost)]; -// bm_sm[r_sm + r_ghost] = bm[nr_c - (r_gl + r_sm + r_ghost)]; -// } -// __syncthreads(); - -// /* Only 1 col remain */ -// if (r_ghost + r_rest == 1) { -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); -// } -// //__syncthreads(); - -// } else { -// if (c_sm < c_rest && f_sm < f_rest) { -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = -// tridiag_backward2(prev_vec_sm, am_sm[0], bm_sm[0], -// vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); -// for (SIZE i = 1; i < r_ghost + r_rest; i++) { -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_backward2( -// vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], -// am_sm[i], bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, -// f_sm)]); -// } -// } -// } -// __syncthreads(); -// /* flush results to v */ -// if (c_sm < c_rest && f_sm < f_rest) { -// for (SIZE i = 0; i < r_ghost + r_rest; i++) { -// vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)] = -// vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; -// } -// } -// __syncthreads(); -// } - -// template -// void ipk_3_3d_adaptive_launcher(Handle &handle, SIZE nr_c, SIZE nc_c, -// SIZE nf_c, T *am, T *bm, T *ddist_r, T *dv, -// SIZE lddv1, SIZE lddv2, int queue_idx) { - -// // printf("am: "); -// // print_matrix_cuda(1, nr, am, nr); -// // printf("bm: "); -// // print_matrix_cuda(1, nr, bm, nr); - -// SIZE total_thread_x = nf_c; -// SIZE total_thread_y = nc_c; -// SIZE total_thread_z = 1; -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbx = F;//std::max(F, std::min(F, total_thread_x)); -// tby = C;//std::max(C, std::min(C, total_thread_y)); -// tbz = 1; -// sm_size = (C * F + 2) * (R + G) * sizeof(T); -// gridx = ceil((float)total_thread_x / tbx); -// gridy = ceil((float)total_thread_y / tby); -// gridz = 1; -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); -// _ipk_3_3d<<>>( -// nr_c, nc_c, nf_c, am, bm, ddist_r, dv, lddv1, lddv2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void ipk_3_3d(Handle &handle, SIZE nr_c, SIZE nc_c, SIZE nf_c, T *am, T -// *bm, -// T *ddist_r, T *dv, SIZE lddv1, SIZE lddv2, int queue_idx, -// int config) { - -// #define IPK(R, C, F, G) \ -// { \ -// ipk_3_3d_adaptive_launcher(handle, nr_c, nc_c, nf_c, am, \ -// bm, ddist_r, dv, lddv1, \ -// lddv2, queue_idx); \ -// } - -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// IPK(2, 2, 128, 2) -// } -// if (profile || config == 5) { -// IPK(2, 2, 64, 2) -// } -// if (profile || config == 4) { -// IPK(2, 2, 32, 2) -// } -// if (profile || config == 3) { -// IPK(2, 2, 16, 2) -// } -// if (profile || config == 2) { -// IPK(8, 8, 8, 4) -// } -// if (profile || config == 1) { -// IPK(4, 4, 4, 4) -// } -// if (profile || config == 0) { -// IPK(2, 2, 2, 2) -// } -// } else { -// printf("Error: ipk_3_3d is only for 3D data\n"); -// } -// #undef IPK -// } - } // namespace mgard_x #endif \ No newline at end of file diff 
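All of the removed `_ipk_*_3d` kernels implement the same numerics: a forward-elimination sweep followed by a backward-substitution sweep along one axis, with `prev_vec_sm` carrying the last value across shared-memory windows. Per the `#ifdef MGARD_X_FMA` fallbacks left in the comments, the two update rules reduce to `v[i] -= v[i-1] * bm[i]` and `v[i] = (v[i] - dist[i] * v[i+1]) / am[i]`. A serial sketch of the two sweeps (coefficients `am`, `bm`, `dist` assumed precomputed):

// Serial sketch of the forward/backward tridiagonal sweeps that the
// deleted kernels perform along one axis of the volume.
#include <cstdio>
#include <vector>

void tridiag_solve(std::vector<double> &v, const std::vector<double> &am,
                   const std::vector<double> &bm,
                   const std::vector<double> &dist) {
  const int n = (int)v.size();
  double prev = 0.0;
  for (int i = 0; i < n; i++) { // forward elimination
    v[i] = v[i] - prev * bm[i];
    prev = v[i];
  }
  prev = 0.0;
  for (int i = n - 1; i >= 0; i--) { // backward substitution
    v[i] = (v[i] - dist[i] * prev) / am[i];
    prev = v[i];
  }
}

int main() {
  std::vector<double> v{1, 2, 3, 4}, am(4, 2.0), bm(4, 0.5), dist(4, 0.5);
  tridiag_solve(v, am, bm, dist);
  for (double x : v)
    std::printf("%f ", x);
  std::printf("\n");
  return 0;
}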
--git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.h
deleted file mode 100644
index 2db0cb8afa..0000000000
--- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.h
+++ /dev/null
@@ -1,35 +0,0 @@
-/*
- * Copyright 2022, Oak Ridge National Laboratory.
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs
- * Author: Jieyang Chen (chenj3@ornl.gov)
- * Date: March 17, 2022
- */
-
-#ifndef MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D_AMR
-#define MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D_AMR
-
-#include "../../Common.h"
-#include "../../CommonInternal.h"
-
-namespace mgard_x {
-
-template
-void ipk_1_3d_amr(Handle &handle, int nr, int nc, int nf_c, T *am, T *bm,
-                  T *ddist_f, T *dv, int lddv1, int lddv2, bool retrieve,
-                  int block_size, T *fv, int ldfv1, int ldfv2, T *bv, int ldbv1,
-                  int ldbv2, int queue_idx, int config);
-
-// template
-// void ipk_2_3d(Handle &handle, int nr, int nc_c, int nf_c, T *am, T *bm,
-//               T *ddist_c, T *dv, int lddv1, int lddv2, int queue_idx,
-//               int config);
-
-// template
-// void ipk_3_3d(Handle &handle, int nr_c, int nc_c, int nf_c, T *am, T
-// *bm,
-//               T *ddist_r, T *dv, int lddv1, int lddv2, int queue_idx,
-//               int config);
-
-} // namespace mgard_x
-
-#endif
\ No newline at end of file
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.hpp
deleted file mode 100644
index 1edf652e08..0000000000
--- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/IterativeProcessingKernel3D_AMR.hpp
+++ /dev/null
@@ -1,1693 +0,0 @@
-/*
- * Copyright 2022, Oak Ridge National Laboratory.
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs
- * Author: Jieyang Chen (chenj3@ornl.gov)
- * Date: March 17, 2022
- */
-
-#ifndef MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D_AMR_TEMPLATE
-#define MGARD_X_ITERATIVE_PROCESSING_KERNEL_3D_AMR_TEMPLATE
-
-#include "../../IPKFunctor.h"
-#include "../../IterativeProcessingKernel3D_AMR.h"
-namespace mgard_x {
-
-// fv has shape nc (lead dim.)
* nr * nf_c / block_size - -template -__global__ void _ipk_1_3d_amr(int nr, int nc, int nf_c, T *am, T *bm, T *dist_f, - T *v, int ldv1, int ldv2, bool retrieve, - int block_size, T *fv, int ldfv1, int ldfv2, - T *bv, int ldbv1, int ldbv2) { - - int c_gl = blockIdx.x * C; - int r_gl = blockIdx.y * R; - int f_gl = threadIdx.x; - - int c_sm = threadIdx.x; - int r_sm = threadIdx.y; - int f_sm = threadIdx.x; - - T *vec = v + get_idx(ldv1, ldv2, r_gl, c_gl, 0); - T *sm = SharedMemory(); - int ldsm1 = F + G; - int ldsm2 = C; - T *vec_sm = sm; - T *bm_sm = sm + R * ldsm1 * ldsm2; - T *dist_sm = bm_sm + ldsm1; - - register T prev_vec_sm = 0.0; - - int c_rest = min(C, nc - blockIdx.x * C); - int r_rest = min(R, nr - blockIdx.y * R); - - int f_rest = nf_c; - int f_ghost = min(nf_c, G); - int f_main = F; - int f_progress = 0; - - // printf("r_sm: %d, r_rest: %d, c_sm: %d, c_rest: %d f_sm: %d, f_rest %d , - // nf_c: %d\n", r_sm, r_rest, c_sm, c_rest, f_sm, f_rest, nf_c); - - // printf("test %f", vec_sm[get_idx(ldsm1, ldsm2, 0, 1, 0)]); - /* Load first ghost */ - if (r_sm < r_rest && f_sm < f_ghost) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)]; - // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", r0_stride, - // i, vec_sm[i * ldsm + c_sm]); - } - if (r_sm == 0) - bm_sm[f_sm] = bm[f_gl]; - } - - f_rest -= f_ghost; - __syncthreads(); - - while (f_rest > F - f_ghost) { - // if (c_gl == 0 && c_sm == 0 && r_gl == 0 && r_sm == 0) printf("%d %d\n", - // f_rest, F - f_ghost); - f_main = min(F, f_rest); - if (r_sm < r_rest && f_sm < f_main) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = - vec[get_idx(ldv1, ldv2, r_sm, i, f_gl + f_ghost)]; - } - if (r_sm == 0) - bm_sm[f_sm + f_ghost] = bm[f_gl + f_ghost]; - } - - __syncthreads(); - - /* Computation of v in parallel*/ - if (r_sm < r_rest && c_sm < c_rest) { - // if (r_gl == 0 && c_gl == 0 && r_sm == 0 && c_sm == 0) printf("%f + %f * - // %f -> %f\n", - // vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, c_sm, 0)], - // prev_vec_sm, bm_sm[0], - // vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, c_sm, - // 0)]+prev_vec_sm * - // bm_sm[0]); - - // store fv - if (f_progress % block_size == 0) { - printf("f_progress0: %d, fv: %f\n", f_progress, prev_vec_sm); - fv[get_idx(ldfv1, ldfv2, r_gl + r_sm, f_progress / block_size, - c_gl + c_sm)] = prev_vec_sm; - } - - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - f_progress++; - - //#pragma unroll 32 - for (int i = 1; i < F; i++) { - // store fv - if (f_progress % block_size == 0) { - printf("f_progress1: %d, fv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)]); - fv[get_idx(ldfv1, ldfv2, r_gl + r_sm, f_progress / block_size, - c_gl + c_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)]; - } - - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], bm_sm[i], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - - f_progress++; - // printf("calc[%d]: %f\n", i, vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, - // i)]); if (r_gl == 0 && c_gl == 0) - // printf("out[%d %d %d] %f\n", r_sm, c_sm, i, vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, c_sm, i)]); - } - - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, F - 1)]; - } - __syncthreads(); - - /* flush results to v */ - if (r_sm < r_rest && f_sm < F) { - for 
(int i = 0; i < c_rest; i++) { - // if (blockIdx.x == 0 && blockIdx.y == 0 && r_sm == 0 && i == 1) { - // printf("store [%d %d %d] %f<-%f [%d %d %d]\n", - // r_sm, i, f_gl, vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)], r_sm, i, f_sm); - // } - vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - // if (blockIdx.x == 0 && blockIdx.y == 0 && r_sm == 0 && i == 1) { - // printf("store [%d %d %d] %f<-%f [%d %d %d]\n", - // r_sm, i, f_gl, vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)], r_sm, i, f_sm); - // } - } - } - __syncthreads(); - - /* Update unloaded col */ - f_rest -= f_main; - - /* Advance c */ - f_gl += F; - - // f_progress += F; - - /* Copy next ghost to main */ - f_ghost = min(G, f_main - (F - G)); - if (r_sm < r_rest && f_sm < f_ghost) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + F)]; - } - if (r_sm == 0) - bm_sm[f_sm] = bm_sm[f_sm + blockDim.x]; - } - __syncthreads(); - } // end of while - - /* Load all rest col */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = - vec[get_idx(ldv1, ldv2, r_sm, i, f_gl + f_ghost)]; - } - if (r_sm == 0) - bm_sm[f_sm + f_ghost] = bm[f_gl + f_ghost]; - } - - __syncthreads(); - - /* Only 1 col remain */ - if (f_ghost + f_rest == 1) { - if (r_sm < r_rest && c_sm < c_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, c_sm, 0)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - if ((f_progress) % block_size == 0) { - printf("extra f_progress2: %d, fv(%d %d %d): %f\n", f_progress, - f_progress / block_size, r_gl, c_gl, prev_vec_sm); - fv[get_idx(ldfv1, ldfv2, r_gl + r_sm, f_progress / block_size, - c_gl + c_sm)] = prev_vec_sm; - } - - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - f_progress++; - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (r_sm < r_rest && c_sm < c_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, c_sm, 0)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - if ((f_progress) % block_size == 0) { - printf("extra f_progress3: %d, fv: %f\n", f_progress, prev_vec_sm); - fv[get_idx(ldfv1, ldfv2, r_gl + r_sm, (f_progress) / block_size, - c_gl + c_sm)] = prev_vec_sm; - } - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - f_progress++; - - for (int i = 1; i < f_ghost + f_rest; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = - // __fma_rn(vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], - // bm_sm[i], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] -= - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)] * bm_sm[i]; - // #endif - - if (f_progress % block_size == 0) { - printf("extra f_progress4: %d, fv: %f\n", 
f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)]); - fv[get_idx(ldfv1, ldfv2, r_gl + r_sm, f_progress / block_size, - c_gl + c_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)]; - } - - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], bm_sm[i], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - - f_progress++; - } - } - } - __syncthreads(); - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_ghost + f_rest) { - for (int i = 0; i < c_rest; i++) { - vec[get_idx(ldv1, ldv2, r_sm, i, f_gl)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = - // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv + - // c_stride, vec[i * row_stride * lddv + c_stride]); - } - } - __syncthreads(); - - f_progress--; - - /* backward */ - T *am_sm = bm_sm; - f_rest = nf_c; - f_ghost = min(nf_c, G); - f_main = F; - f_gl = threadIdx.x; - prev_vec_sm = 0.0; - - /* Load first ghost */ - if (r_sm < r_rest && f_sm < f_ghost) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)]; - // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", r0_stride, - // i, vec_sm[i * ldsm + c_sm]); - } - } - if (r_sm == 0) { - am_sm[f_sm] = am[(nf_c - 1) - f_gl]; - dist_sm[f_sm] = dist_f[(nf_c - 1) - f_gl]; // * -1; - } - f_rest -= f_ghost; - __syncthreads(); - - while (f_rest > F - f_ghost) { - f_main = min(F, f_rest); - if (r_sm < r_rest && f_sm < f_main) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = - vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl - f_ghost)]; - } - } - if (r_sm == 0) { - am_sm[f_sm + f_ghost] = am[(nf_c - 1) - f_gl - f_ghost]; - dist_sm[f_sm + f_ghost] = dist_f[(nf_c - 1) - f_gl - f_ghost]; // * -1; - } - __syncthreads(); - - /* Computation of v in parallel*/ - if (r_sm < r_rest && c_sm < c_rest) { - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; if (r_gl == 0 && c_gl == 0 && r_sm == 0 && - // c_sm == 0) - // printf("(%f + %f * %f) * %f -> %f\n", - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)], - // dist_sm[0], prev_vec_sm, am_sm[0], - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] - dist_sm[0] - // * prev_vec_sm) / am_sm[0]); - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // __fma_rn(dist_sm[0], prev_vec_sm, - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - if (f_progress > 0 && f_progress % block_size == 0) { - printf("f_progress5: %d, bv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - bv[get_idx(ldbv1, ldbv2, r_gl + r_sm, f_progress / block_size - 1, - c_gl + c_sm)] = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]; - } - - f_progress--; - - //#pragma unroll 32 - for (int i = 1; i < F; i++) { - // if (r_gl == 0 && c_gl == 0 && r_sm == 0 && c_sm == 0) - // printf("(%f + %f * %f) * %f -> %f\n", - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)], - // dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i-1)], - 
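The deleted AMR variant adds one idea to the plain kernel: as `f_progress` advances through the forward and backward sweeps, the carried value is checkpointed into `fv` (and into `bv` on the way back) at every `block_size` boundary, so a block can later be re-solved without redoing the whole line. A serial sketch of the forward-sweep checkpointing (self-contained; update rule reduced to `v[i] -= v[i-1] * bm[i]` as noted above):

// Sketch of the per-block checkpointing in the deleted _ipk_1_3d_amr:
// capture the carried value each time f_progress crosses a block boundary.
#include <cstdio>
#include <vector>

int main() {
  const int n = 16, block_size = 4;
  std::vector<double> v(n, 1.0), bm(n, 0.5);
  std::vector<double> fv(n / block_size); // forward-sweep checkpoints
  double prev = 0.0;
  for (int f_progress = 0; f_progress < n; f_progress++) {
    if (f_progress % block_size == 0)
      fv[f_progress / block_size] = prev; // value entering this block
    v[f_progress] -= prev * bm[f_progress];
    prev = v[f_progress];
  }
  for (double x : fv)
    std::printf("%f ", x); // one checkpoint per block
  std::printf("\n");
  return 0;
}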
// (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] - dist_sm[i] - // * vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i-1)]) * - // am_sm[i]); - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - // - 1)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]) * am_sm[i]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] - - // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - - // 1)]) / am_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_backward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], dist_sm[i], - am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - - if (f_progress > 0 && f_progress % block_size == 0) { - printf("f_progress6: %d, bv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - bv[get_idx(ldbv1, ldbv2, r_gl + r_sm, f_progress / block_size - 1, - c_gl + c_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]; - } - - f_progress--; - } - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, blockDim.x - 1)]; - } - __syncthreads(); - - /* flush results to v */ - if (r_sm < r_rest && f_sm < F) { - for (int i = 0; i < c_rest; i++) { - vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - } - } - __syncthreads(); - - /* Update unloaded col */ - f_rest -= f_main; - - /* Advance c */ - f_gl += F; - - /* Copy next ghost to main */ - f_ghost = min(G, f_main - (F - G)); - if (r_sm < r_rest && f_sm < f_ghost) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + F)]; - } - if (r_sm == 0) { - am_sm[f_sm] = am_sm[f_sm + F]; - dist_sm[f_sm] = dist_sm[f_sm + F]; - } - } - __syncthreads(); - } // end of while - - /* Load all rest col */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm + f_ghost)] = - vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl - f_ghost)]; - } - } - if (r_sm == 0) { - am_sm[f_sm + f_ghost] = am[(nf_c - 1) - f_gl - f_ghost]; - dist_sm[f_sm + f_ghost] = dist_f[(nf_c - 1) - f_gl - f_ghost]; - } - __syncthreads(); - - /* Only 1 col remain */ - if (f_ghost + f_rest == 1) { - if (r_sm < r_rest && c_sm < c_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // __fma_rn(dist_sm[0], prev_vec_sm, - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - if (f_progress > 0 && f_progress % block_size == 0) { - printf("f_progress7: %d, bv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - bv[get_idx(ldbv1, ldbv2, r_gl + r_sm, f_progress / block_size - 1, - c_gl + c_sm)] = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]; - } - - f_progress--; - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (r_sm < r_rest && c_sm < c_rest) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // __fma_rn(dist_sm[0], prev_vec_sm, - // 
vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - - if (f_progress > 0 && f_progress % block_size == 0) { - printf("f_progress8: %d, bv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]); - bv[get_idx(ldbv1, ldbv2, r_gl + r_sm, f_progress / block_size - 1, - c_gl + c_sm)] = vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)]; - } - - f_progress--; - - for (int i = 1; i < f_ghost + f_rest; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - // - 1)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]) * am_sm[i]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] - - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)] * - // dist_sm[i]) / am_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)] = tridiag_backward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i - 1)], dist_sm[i], - am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - - if (f_progress > 0 && f_progress % block_size == 0) { - printf("f_progress9: %d, bv: %f\n", f_progress, - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]); - bv[get_idx(ldbv1, ldbv2, r_gl + r_sm, f_progress / block_size - 1, - c_gl + c_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, i)]; - } - - f_progress--; - } - } - } - __syncthreads(); - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_ghost + f_rest) { - for (int i = 0; i < c_rest; i++) { - vec[get_idx(ldv1, ldv2, r_sm, i, (nf_c - 1) - f_gl)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = - // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv + - // c_stride, vec[i * row_stride * lddv + c_stride]); - } - } - __syncthreads(); -} - -template -void ipk_1_3d_amr_adaptive_launcher(Handle &handle, int nr, int nc, - int nf_c, T *am, T *bm, T *ddist_f, T *dv, - int lddv1, int lddv2, bool retrieve, - int block_size, T *fv, int ldfv1, int ldfv2, - T *bv, int ldbv1, int ldbv2, - int queue_idx) { - // std::cout << "test\n"; - - int total_thread_x = nc; - int total_thread_y = nr; - int total_thread_z = 1; - int tbx, tby, tbz, gridx, gridy, gridz; - dim3 threadsPerBlock, blockPerGrid; - size_t sm_size; - - tbx = C; // std::max(C, std::min(C, total_thread_x)); - tby = R; // std::max(R, std::min(R, total_thread_y)); - tbz = 1; - sm_size = (R * C + 2) * (F + G) * sizeof(T); - gridx = ceil((float)total_thread_x / tbx); - gridy = ceil((float)total_thread_y / tby); - gridz = 1; - threadsPerBlock = dim3(F, tby, tbz); - blockPerGrid = dim3(gridx, gridy, gridz); - - _ipk_1_3d_amr<<>>( - nr, nc, nf_c, am, bm, ddist_f, dv, lddv1, lddv2, retrieve, block_size, fv, - ldfv1, ldfv2, bv, ldbv1, ldbv2); - gpuErrchk(cudaGetLastError()); - if (handle.sync_and_check_all_kernels) { - gpuErrchk(cudaDeviceSynchronize()); - } - // std::cout << "test\n"; -} - -template -void ipk_1_3d_amr(Handle &handle, int nr, int nc, int nf_c, T *am, T *bm, - T *ddist_f, T *dv, int lddv1, int lddv2, bool retrieve, - int block_size, T *fv, int ldfv1, int ldfv2, T *bv, int ldbv1, - int ldbv2, int queue_idx, int config) { - -#define IPK(R, C, F, G) \ - { \ - 
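The deleted `ipk_1_3d_amr_adaptive_launcher` sizes its launch the same way the plain launchers do: shared memory holds one `(F + G)`-deep line per `(r, c)` thread of the tile plus two coefficient arrays, i.e. `(R * C + 2) * (F + G) * sizeof(T)` bytes, and the grid covers the `nc x nr` threads in tiles of `C x R`. A standalone sketch of that arithmetic (example numbers only):

// Sketch of the launch-geometry math in the deleted adaptive launchers.
#include <cstdio>

int main() {
  const int R = 2, C = 2, F = 128, G = 2;
  const int nr = 100, nc = 100; // total threads in y and x
  size_t sm_size = (R * C + 2) * (F + G) * sizeof(double);
  int gridx = (nc + C - 1) / C; // ceil(nc / C)
  int gridy = (nr + R - 1) / R; // ceil(nr / R)
  std::printf("grid %d x %d, %zu bytes of shared memory per block\n",
              gridx, gridy, sm_size);
  return 0;
}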
ipk_1_3d_amr_adaptive_launcher( \ - handle, nr, nc, nf_c, am, bm, ddist_f, dv, lddv1, lddv2, retrieve, \ - block_size, fv, ldfv1, ldfv2, bv, ldbv1, ldbv2, queue_idx); \ - } - bool profile = false; - if (handle.profile_kernels) { - profile = true; - } - if (D == 3) { - if (profile || config == 6) { - IPK(2, 2, 128, 2) - } - if (profile || config == 5) { - IPK(2, 2, 64, 2) - } - if (profile || config == 4) { - IPK(2, 2, 32, 2) - } - if (profile || config == 3) { - IPK(4, 4, 16, 4) - } - if (profile || config == 2) { - IPK(8, 8, 8, 4) - } - if (profile || config == 1) { - IPK(4, 4, 4, 4) - } - if (profile || config == 0) { - IPK(2, 2, 2, 2) - } - } else if (D == 2) { - if (profile || config == 6) { - IPK(1, 2, 128, 2) - } - if (profile || config == 5) { - IPK(1, 2, 64, 2) - } - if (profile || config == 4) { - IPK(1, 2, 32, 2) - } - if (profile || config == 3) { - IPK(1, 4, 16, 4) - } - if (profile || config == 2) { - IPK(1, 8, 8, 4) - } - if (profile || config == 1) { - IPK(1, 4, 4, 4) - } - if (profile || config == 0) { - IPK(1, 2, 4, 2) - } - } else if (D == 1) { - if (profile || config == 6) { - IPK(1, 1, 128, 2) - } - if (profile || config == 5) { - IPK(1, 1, 64, 2) - } - if (profile || config == 4) { - IPK(1, 1, 32, 2) - } - if (profile || config == 3) { - IPK(1, 1, 16, 4) - } - if (profile || config == 2) { - IPK(1, 1, 8, 4) - } - if (profile || config == 1) { - IPK(1, 1, 8, 4) - } - if (profile || config == 0) { - IPK(1, 1, 8, 2) - } - } -#undef IPK -} - -#if 0 -template -__global__ void _ipk_2_3d(int nr, int nc_c, int nf_c, T *am, T *bm, T *dist_c, - T *v, int ldv1, int ldv2) { - - int f_gl = blockIdx.x * F; - int r_gl = blockIdx.y * R; - int c_gl = 0; - - int f_sm = threadIdx.x; - int r_sm = threadIdx.y; - int c_sm = threadIdx.x; - - T *vec = v + get_idx(ldv1, ldv2, r_gl, 0, f_gl); - T *sm = SharedMemory(); - int ldsm1 = F; - int ldsm2 = C + G; - T *vec_sm = sm; - T *bm_sm = sm + R * ldsm1 * ldsm2; - T *dist_sm = bm_sm + ldsm2; - - register T prev_vec_sm = 0.0; - - int f_rest = min(F, nf_c - blockIdx.x * F); - int r_rest = min(R, nr - blockIdx.y * R); - - // if (blockIdx.x == 1 && blockIdx.y == 0 && f_sm == 0 && r_sm == 0) { - // printf("f_rest: %d r_rest: %d\n", f_rest, r_rest); - // } - - int c_rest = nc_c; - int c_ghost = min(nc_c, G); - int c_main = C; - - /* Load first ghost */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)]; - // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", r0_stride, - // i, vec_sm[i * ldsm + c_sm]); - } - } - if (r_sm == 0 && c_sm < c_ghost) - bm_sm[c_sm] = bm[c_gl + c_sm]; - c_rest -= c_ghost; - __syncthreads(); - - while (c_rest > C - c_ghost) { - // printf("%d %d %d\n", c_rest, C, c_ghost); - c_main = min(C, c_rest); - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_main; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, c_gl + i + c_ghost, f_sm)]; - } - } - if (r_sm == 0 && c_sm < c_main) - bm_sm[c_sm + c_ghost] = bm[c_gl + c_sm + c_ghost]; - __syncthreads(); - - /* Computation of v in parallel*/ - if (r_sm < r_rest && f_sm < f_rest) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = 
tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - - for (int i = 1; i < C; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // __fma_rn(vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], - // bm_sm[i], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - // #else - // // if (blockIdx.x == 1 && blockIdx.y == 0 && f_sm == 0 && r_sm - // == 0) { - // // printf("calc: %f %f %f -> %f \n", vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, i, f_sm)], - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], - // bm_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] - - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)] * - // bm_sm[i]); - // // } - - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] -= - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)] * bm_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], bm_sm[i], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - } - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, C - 1, f_sm)]; - } - __syncthreads(); - - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < C; i++) { - // if (blockIdx.x == 1 && blockIdx.y == 0 && f_sm == 0 && r_sm == 0) { - // printf("store: %f\n", vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, - // f_sm)]); - // } - vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - } - } - __syncthreads(); - - /* Update unloaded col */ - c_rest -= c_main; - - /* Advance c */ - c_gl += C; - - /* Copy next ghost to main */ - c_ghost = min(G, c_main - (C - G)); - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + C, f_sm)]; - } - } - if (r_sm == 0 && c_sm < c_ghost) - bm_sm[c_sm] = bm_sm[c_sm + C]; - __syncthreads(); - - } // end of while - - /* Load all rest col */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, c_gl + i + c_ghost, f_sm)]; - } - } - if (r_sm == 0 && c_sm < c_rest) - bm_sm[c_sm + c_ghost] = bm[c_gl + c_sm + c_ghost]; - __syncthreads(); - - /* Only 1 col remain */ - if (c_ghost + c_rest == 1) { - if (r_sm < r_rest && f_sm < f_rest) { - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] -= prev_vec_sm * bm_sm[0]; - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (r_sm < r_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - for (int i = 1; i < c_ghost + c_rest; 
i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // __fma_rn(vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], - // bm_sm[i], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] -= - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)] * bm_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], bm_sm[i], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - } - } - } - __syncthreads(); - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost + c_rest; i++) { - vec[get_idx(ldv1, ldv2, r_sm, c_gl + i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = - // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv + - // c_stride, vec[i * row_stride * lddv + c_stride]); - } - } - __syncthreads(); - - /* backward */ - T *am_sm = bm_sm; - c_rest = nc_c; - c_ghost = min(nc_c, G); - c_main = C; - c_gl = 0; - prev_vec_sm = 0.0; - - // if (f_gl+f_sm == 0 && r_gl+r_sm == 0 && idx[3] == 0) debug = false; - // if (debug) printf("block id: (%d %d %d) thread id: (%d %d %d)\n", - // blockIdx.x, blockIdx.y, blockIdx.z, - // threadIdx.x, threadIdx.y, threadIdx.z); - - /* Load first ghost */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)]; - // if (debug) printf("load vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, r_sm, - // i, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - } - } - if (r_sm == 0 && c_sm < c_ghost) { - am_sm[c_sm] = am[(nc_c - 1) - (c_gl + c_sm)]; - dist_sm[c_sm] = dist_c[(nc_c - 1) - (c_gl + c_sm)]; - } - c_rest -= c_ghost; - __syncthreads(); - - while (c_rest > C - c_ghost) { - // printf("%d %d %d\n", c_rest, C, c_ghost); - c_main = min(C, c_rest); - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_main; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = vec[get_idx( - ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i + c_ghost), f_sm)]; - // if (debug) printf("load vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, - // r_sm, i + c_ghost, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + - // c_ghost, f_sm)]); - } - } - if (r_sm == 0 && c_sm < c_main) { - am_sm[c_sm + c_ghost] = am[(nc_c - 1) - (c_gl + c_sm + c_ghost)]; - dist_sm[c_sm + c_ghost] = dist_c[(nc_c - 1) - (c_gl + c_sm + c_ghost)]; - } - __syncthreads(); - - // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // printf("*****test\n"); - /* Computation of v in parallel*/ - if (r_sm < r_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]) * am_sm[0]; - // #else - // // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // // printf("(%f + %f * %f) * %f -> %f\n", - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)], - // // dist_sm[0], prev_vec_sm, am_sm[0], - // // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] - - // dist_sm[0] * prev_vec_sm) / am_sm[0]); vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, 0, f_sm)] = (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, - // f_sm)] - dist_sm[0] * prev_vec_sm) / am_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 
0, c_sm)]); - // if (debug) printf("calc vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, r_sm, - // 0, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - - for (int i = 1; i < C; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, - // f_sm)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)]) * am_sm[i]; - // #else - // // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // // printf("(%f + %f * %f) * %f -> %f\n", - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)], - // // dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, - // i-1, f_sm)], am_sm[i], - // // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] - - // // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, - // f_sm)]) / am_sm[i]); - - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] - - // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, - // f_sm)]) / am_sm[i]; - - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_backward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], dist_sm[i], - am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - - // if (debug) printf("calc vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, - // r_sm, i, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - } - - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, r_sm, C - 1, f_sm)]; - } - __syncthreads(); - - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < C; i++) { - vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - } - } - __syncthreads(); - - /* Update unloaded col */ - c_rest -= c_main; - - /* Advance c */ - c_gl += C; - - /* Copy next ghost to main */ - c_ghost = min(G, c_main - (C - G)); - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + C, f_sm)]; - } - } - if (r_sm == 0 && c_sm < c_ghost) { - am_sm[c_sm] = am_sm[c_sm + C]; - dist_sm[c_sm] = dist_sm[c_sm + C]; - } - __syncthreads(); - - } // end of while - - // Load all rest col - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, f_sm)] = vec[get_idx( - ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i + c_ghost), f_sm)]; - - // if (debug) printf("load ec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, r_sm, - // i + c_ghost, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, i + c_ghost, - // f_sm)]); - } - } - if (r_sm == 0 && c_sm < c_rest) { - am_sm[c_sm + c_ghost] = am[(nc_c - 1) - (c_gl + c_sm + c_ghost)]; - dist_sm[c_sm + c_ghost] = dist_c[(nc_c - 1) - (c_gl + c_sm + c_ghost)]; - } - __syncthreads(); - - /* Only 1 col remain */ - if (c_ghost + c_rest == 1) { - if (r_sm < r_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]) * am_sm[0]; - // #else - // // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // // printf("(%f + %f * %f) * %f -> %f\n", - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)], - // // dist_sm[0], prev_vec_sm, am_sm[0], - // // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] - - // dist_sm[0] * prev_vec_sm) / am_sm[0]); vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, 0, f_sm)] = (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, - // f_sm)] - dist_sm[0] * prev_vec_sm) / am_sm[0]; - // #endif 
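// ---------------------------------------------------------------------------
// [Editor's aside] tridiag_forward/tridiag_backward used throughout these
// kernels are the two halves of a Thomas solve with precomputed factors
// am/bm: forward elimination down the line, then back substitution in
// reverse. A minimal serial reference, assuming the helpers match the
// commented-out scalar code above (thomas_reference is an illustrative name,
// not MGARD API):
template <typename T>
void thomas_reference(int n, const T *am, const T *bm, const T *dist, T *v) {
  // Forward elimination: v[0] is untouched (prev_vec_sm starts at 0).
  for (int i = 1; i < n; i++)
    v[i] -= v[i - 1] * bm[i];
  // Back substitution, walking in reverse with v[n] treated as zero; this is
  // why the kernels reload vec/am/dist with reversed ((n-1) - i) indices.
  for (int j = n - 1; j >= 0; j--) {
    T next = (j + 1 < n) ? v[j + 1] : (T)0;
    v[j] = (v[j] - dist[j] * next) / am[j];
  }
}
// The blocked kernels stream fixed-size tiles of each line through shared
// memory, carrying the last value of a tile (prev_vec_sm) into the next, so
// each thread applies exactly these recurrences along its row.
// ---------------------------------------------------------------------------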
- vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)]); - // if (debug) printf("calc vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, r_sm, - // 0, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (r_sm < r_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, - // r_sm, 0, f_sm)]) * am_sm[0]; - // #else - // // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // // printf("(%f + %f * %f) * %f -> %f\n", - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)], - // // dist_sm[0], prev_vec_sm, am_sm[0], - // // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] - - // dist_sm[0] * prev_vec_sm) / am_sm[0]); vec_sm[get_idx(ldsm1, - // ldsm2, r_sm, 0, f_sm)] = (vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, - // f_sm)] - dist_sm[0] * prev_vec_sm) / am_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, c_sm)]); - // if (debug) printf("calc vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, r_sm, - // 0, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)]); - for (int i = 1; i < c_ghost + c_rest; i++) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, - // f_sm)], - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)]) * am_sm[i]; - // #else - // // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) - // // printf("(%f + %f * %f) * %f -> %f\n", - // // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)], - // // dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, - // i-1, f_sm)], am_sm[i], - // // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] - - // // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, - // f_sm)]) / am_sm[i]); - // vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] - - // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, - // f_sm)]) / am_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)] = tridiag_backward( - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i - 1, f_sm)], dist_sm[i], - am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - // if (debug) printf("calc vec_sm[%d] = %f\n", get_idx(ldsm1, ldsm2, - // r_sm, i, f_sm), vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]); - } - } - } - __syncthreads(); - /* flush results to v */ - if (r_sm < r_rest && f_sm < f_rest) { - for (int i = 0; i < c_ghost + c_rest; i++) { - vec[get_idx(ldv1, ldv2, r_sm, (nc_c - 1) - (c_gl + i), f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, r_sm, i, f_sm)]; - // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = - // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv + - // c_stride, vec[i * row_stride * lddv + c_stride]); - } - } - __syncthreads(); -} - -template -void ipk_2_3d_adaptive_launcher(Handle &handle, int nr, int nc_c, - int nf_c, T *am, T *bm, T *ddist_c, T *dv, - int lddv1, int lddv2, int queue_idx) { - int total_thread_x = nf_c; - int total_thread_y = nr; - int total_thread_z = 1; - int tbx, tby, tbz, gridx, gridy, gridz; - dim3 threadsPerBlock, blockPerGrid; - size_t sm_size; - - tbx = F;//std::max(F, std::min(F, total_thread_x)); - tby = R;//std::max(R, std::min(R, 
total_thread_y));
-  tbz = 1;
-  sm_size = (R * F + 2) * (C + G) * sizeof(T);
-  gridx = ceil((float)total_thread_x / tbx);
-  gridy = ceil((float)total_thread_y / tby);
-  gridz = 1;
-  threadsPerBlock = dim3(tbx, tby, tbz);
-  blockPerGrid = dim3(gridx, gridy, gridz);
-  _ipk_2_3d<T, R, C, F, G><<<blockPerGrid, threadsPerBlock, sm_size,
-                             *(cudaStream_t *)handle.get(queue_idx)>>>(
-      nr, nc_c, nf_c, am, bm, ddist_c, dv, lddv1, lddv2);
-  gpuErrchk(cudaGetLastError());
-  if (handle.sync_and_check_all_kernels) {
-    gpuErrchk(cudaDeviceSynchronize());
-  }
-}
-
-template <DIM D, typename T>
-void ipk_2_3d(Handle<D, T> &handle, int nr, int nc_c, int nf_c, T *am, T *bm,
-              T *ddist_c, T *dv, int lddv1, int lddv2, int queue_idx,
-              int config) {
-
-#define IPK(R, C, F, G)                                                        \
-  {                                                                            \
-    ipk_2_3d_adaptive_launcher<D, T, R, C, F, G>(                              \
-        handle, nr, nc_c, nf_c, am, bm, ddist_c, dv, lddv1, lddv2, queue_idx); \
-  }
-  bool profile = false;
-  if (handle.profile_kernels) {
-    profile = true;
-  }
-  if (D == 3) {
-    if (profile || config == 6) {
-      IPK(2, 2, 128, 2)
-    }
-    if (profile || config == 5) {
-      IPK(2, 2, 64, 2)
-    }
-    if (profile || config == 4) {
-      IPK(2, 2, 32, 2)
-    }
-    if (profile || config == 3) {
-      IPK(4, 4, 16, 4)
-    }
-    if (profile || config == 2) {
-      IPK(8, 8, 8, 4)
-    }
-    if (profile || config == 1) {
-      IPK(4, 4, 4, 4)
-    }
-    if (profile || config == 0) {
-      IPK(2, 2, 2, 2)
-    }
-  } else if (D == 2) {
-    if (profile || config == 6) {
-      IPK(1, 2, 128, 2)
-    }
-    if (profile || config == 5) {
-      IPK(1, 2, 64, 2)
-    }
-    if (profile || config == 4) {
-      IPK(1, 2, 32, 2)
-    }
-    if (profile || config == 3) {
-      IPK(1, 4, 16, 4)
-    }
-    if (profile || config == 2) {
-      IPK(1, 8, 8, 4)
-    }
-    if (profile || config == 1) {
-      IPK(1, 4, 4, 4)
-    }
-    if (profile || config == 0) {
-      IPK(1, 2, 4, 2)
-    }
-  } else {
-    printf("Error: ipk_2_3d is only for 3D and 2D data\n");
-  }
-#undef IPK
-}
-
-template <typename T, int R, int C, int F, int G>
-__global__ void _ipk_3_3d(int nr_c, int nc_c, int nf_c, T *am, T *bm,
-                          T *dist_r, T *v, int ldv1, int ldv2) {
-
-  int f_gl = blockIdx.x * F;
-  int c_gl = blockIdx.y * C;
-  int r_gl = 0;
-
-  int f_sm = threadIdx.x;
-  int c_sm = threadIdx.y;
-  int r_sm = threadIdx.x;
-
-  T *vec = v + get_idx(ldv1, ldv2, 0, c_gl, f_gl);
-  T *sm = SharedMemory<T>();
-  int ldsm1 = F;
-  int ldsm2 = C;
-  T *vec_sm = sm;
-  T *bm_sm = sm + (R + G) * ldsm1 * ldsm2;
-  T *dist_sm = bm_sm + (R + G);
-
-  register T prev_vec_sm = 0.0;
-
-  int f_rest = min(F, nf_c - blockIdx.x * F);
-  int c_rest = min(C, nc_c - blockIdx.y * C);
-
-  int r_rest = nr_c;
-  int r_ghost = min(nr_c, G);
-  int r_main = R;
-
-  /* Load first ghost */
-  if (c_sm < c_rest && f_sm < f_rest) {
-    for (int i = 0; i < r_ghost; i++) {
-      vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] =
-          vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)];
-      // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", r0_stride,
-      // i, vec_sm[i * ldsm + c_sm]);
-    }
-  }
-
-  if (c_sm == 0 && r_sm < r_ghost)
-    bm_sm[r_sm] = bm[r_gl + r_sm];
-  r_rest -= r_ghost;
-  __syncthreads();
-
-  while (r_rest > R - r_ghost) {
-    r_main = min(R, r_rest);
-    if (c_sm < c_rest && f_sm < f_rest) {
-      for (int i = 0; i < r_main; i++) {
-        vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] =
-            vec[get_idx(ldv1, ldv2, r_gl + i + r_ghost, c_sm, f_sm)];
-        // printf("%d\n", r_gl + i + r_ghost);
-      }
-    }
-    if (c_sm == 0 && r_sm < r_main)
-      bm_sm[r_sm + r_ghost] = bm[r_gl + r_sm + r_ghost];
-    __syncthreads();
-
-    /* Computation of v in parallel*/
-    if (c_sm < c_rest && f_sm < f_rest) {
-
-      // #ifdef MGARD_X_FMA
-      //       vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] =
-      //       __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0,
-      //       c_sm, f_sm)]);
-      // #else
-      //       vec_sm[get_idx(ldsm1, ldsm2, 0,
c_sm, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - for (int i = 1; i < R; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // __fma_rn(vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], - // bm_sm[i], - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] -= - // vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)] * bm_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], bm_sm[i], - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); - } - - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, R - 1, c_sm, f_sm)]; - } - __syncthreads(); - - /* flush results to v */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < R; i++) { - vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; - } - } - __syncthreads(); - - // /* Update unloaded col */ - r_rest -= r_main; - - /* Advance c */ - r_gl += R; - - /* Copy next ghost to main */ - r_ghost = min(G, r_main - (R - G)); - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, i + R, c_sm, f_sm)]; - } - } - if (c_sm == 0 && r_sm < r_ghost) - bm_sm[r_sm] = bm_sm[r_sm + R]; - __syncthreads(); - - } // end of while - - /* Load all rest col */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = - vec[get_idx(ldv1, ldv2, r_gl + i + r_ghost, c_sm, f_sm)]; - } - } - - if (c_sm == 0 && r_sm < r_rest) - bm_sm[r_sm + r_ghost] = bm[r_gl + r_sm + r_ghost]; - __syncthreads(); - - /* Only 1 col remain */ - if (r_ghost + r_rest == 1) { - if (c_sm < c_rest && f_sm < f_rest) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, - // c_sm, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (c_sm < c_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // __fma_rn(prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, - // c_sm, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] -= prev_vec_sm * - // bm_sm[0]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = tridiag_forward( - prev_vec_sm, bm_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - for (int i = 1; i < r_ghost + r_rest; i++) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // __fma_rn(vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], - // bm_sm[i], - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); - // #else - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] -= - // vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)] * bm_sm[i]; - // #endif - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_forward( - vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], bm_sm[i], - 
vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); - } - } - } - __syncthreads(); - /* flush results to v */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_ghost + r_rest; i++) { - vec[get_idx(ldv1, ldv2, r_gl + i, c_sm, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; - // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] = - // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv + - // c_stride, vec[i * row_stride * lddv + c_stride]); - } - } - __syncthreads(); - - /* backward */ - T *am_sm = bm_sm; - r_rest = nr_c; - r_ghost = min(nr_c, G); - r_main = R; - r_gl = 0; - prev_vec_sm = 0.0; - - /* Load first ghost */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)]; - // if (r_sm == 0) printf("r0_stride = %d, vec_sm[%d] = %f\n", r0_stride, - // i, vec_sm[i * ldsm + c_sm]); - } - } - - if (c_sm == 0 && r_sm < r_ghost) { - am_sm[r_sm] = am[(nr_c - 1) - (r_gl + r_sm)]; - dist_sm[r_sm] = dist_r[(nr_c - 1) - (r_gl + r_sm)]; - } - r_rest -= r_ghost; - __syncthreads(); - - while (r_rest > R - r_ghost) { - r_main = min(R, r_rest); - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_main; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = vec[get_idx( - ldv1, ldv2, (nr_c - 1) - (r_gl + i + r_ghost), c_sm, f_sm)]; - } - } - if (c_sm == 0 && r_sm < r_main) { - am_sm[r_sm + r_ghost] = am[(nr_c - 1) - (r_gl + r_sm + r_ghost)]; - dist_sm[r_sm + r_ghost] = dist_r[(nr_c - 1) - (r_gl + r_sm + r_ghost)]; - } - __syncthreads(); - - /* Computation of v in parallel*/ - if (c_sm < c_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, 0, - // c_sm, f_sm)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - for (int i = 1; i < R; i++) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, - // f_sm)], - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]) * am_sm[i]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] - - // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, - // f_sm)]) / am_sm[i]; - // #endif - - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_backward( - vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], dist_sm[i], - am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]); - } - - /* Store last v */ - prev_vec_sm = vec_sm[get_idx(ldsm1, ldsm2, R - 1, c_sm, f_sm)]; - } - __syncthreads(); - - /* flush results to v */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < R; i++) { - // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && - // threadIdx.y == 0) { - // printf("%d %d %d (%f) <- %d %d %d\n", (nr - 1) - (r_gl + i), c_sm, - // f_sm, - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)], i, c_sm, - // f_sm); - // } - vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]; - } - } - __syncthreads(); - - // /* Update unloaded col */ - r_rest -= r_main; - - /* 
Advance c */ - r_gl += R; - - /* Copy next ghost to main */ - r_ghost = min(G, r_main - (R - G)); - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_ghost; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - vec_sm[get_idx(ldsm1, ldsm2, i + R, c_sm, f_sm)]; - } - } - if (c_sm == 0 && r_sm < r_ghost) { - am_sm[r_sm] = am_sm[r_sm + R]; - dist_sm[r_sm] = dist_sm[r_sm + R]; - } - __syncthreads(); - - } // end of while - - /* Load all rest col */ - if (c_sm < c_rest && f_sm < f_rest) { - for (int i = 0; i < r_rest; i++) { - vec_sm[get_idx(ldsm1, ldsm2, i + r_ghost, c_sm, f_sm)] = vec[get_idx( - ldv1, ldv2, (nr_c - 1) - (r_gl + i + r_ghost), c_sm, f_sm)]; - } - } - if (c_sm == 0 && r_sm < r_rest) { - am_sm[r_sm + r_ghost] = am[(nr_c - 1) - (r_gl + r_sm + r_ghost)]; - dist_sm[r_sm + r_ghost] = dist_r[(nr_c - 1) - (r_gl + r_sm + r_ghost)]; - } - __syncthreads(); - - /* Only 1 col remain */ - if (r_ghost + r_rest == 1) { - if (c_sm < c_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, 0, - // c_sm, f_sm)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && - // threadIdx.y == 0) { - // printf("backward 1 (%f) %f %f %f %f\n", tridiag_backward(prev_vec_sm, - // dist_sm[0], am_sm[0], - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]), prev_vec_sm, - // dist_sm[0], am_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, - // f_sm)]); - - // } - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - // printf ("prev_vec_sm = %f\n", prev_vec_sm ); - // printf ("vec_sm[r_sm * ldsm + 0] = %f\n", vec_sm[r_sm * ldsm + 0] ); - } - //__syncthreads(); - - } else { - if (c_sm < c_rest && f_sm < f_rest) { - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // __fma_rn(dist_sm[0], prev_vec_sm, vec_sm[get_idx(ldsm1, ldsm2, 0, - // c_sm, f_sm)]) * am_sm[0]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] - dist_sm[0] * - // prev_vec_sm) / am_sm[0]; - // #endif - // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && - // threadIdx.y == 0) { - // printf("backward 1 (%f) %f %f %f %f\n", tridiag_backward(prev_vec_sm, - // dist_sm[0], am_sm[0], - // vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]), prev_vec_sm, - // dist_sm[0], am_sm[0], vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, - // f_sm)]); - - // } - - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = - tridiag_backward(prev_vec_sm, dist_sm[0], am_sm[0], - vec_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)]); - for (int i = 1; i < r_ghost + r_rest; i++) { - - // #ifdef MGARD_X_FMA - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // __fma_rn(dist_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, - // f_sm)], - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]) * am_sm[i]; - // #else - // vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = - // (vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] - - // dist_sm[i] * vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, - // f_sm)]) / am_sm[i]; - // #endif - // if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0 && - // threadIdx.y == 0) { printf("backward R=%d (%f) %f %f %f %f\n", i, - // tridiag_backward(vec_sm[get_idx(ldsm1, 
ldsm2, i - 1, c_sm, f_sm)],
-        // dist_sm[i], am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm,
-        // f_sm)]), vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)],
-        // dist_sm[i], am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm,
-        // f_sm)]);
-
-        // }
-
-        vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)] = tridiag_backward(
-            vec_sm[get_idx(ldsm1, ldsm2, i - 1, c_sm, f_sm)], dist_sm[i],
-            am_sm[i], vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)]);
-      }
-    }
-  }
-  __syncthreads();
-  /* flush results to v */
-  if (c_sm < c_rest && f_sm < f_rest) {
-    for (int i = 0; i < r_ghost + r_rest; i++) {
-      vec[get_idx(ldv1, ldv2, (nr_c - 1) - (r_gl + i), c_sm, f_sm)] =
-          vec_sm[get_idx(ldsm1, ldsm2, i, c_sm, f_sm)];
-      // printf("c_stride = %d, c_sm = %d, vec_sm = %f, vec[%d] =
-      // %f\n",c_stride, c_sm, vec_sm[r_sm * ldsm + 0],i * row_stride * lddv +
-      // c_stride, vec[i * row_stride * lddv + c_stride]);
-    }
-  }
-  __syncthreads();
-}
-
-template <DIM D, typename T, int R, int C, int F, int G>
-void ipk_3_3d_adaptive_launcher(Handle<D, T> &handle, int nr_c, int nc_c,
-                                int nf_c, T *am, T *bm, T *ddist_r, T *dv,
-                                int lddv1, int lddv2, int queue_idx) {
-
-  // printf("am: ");
-  // print_matrix_cuda(1, nr, am, nr);
-  // printf("bm: ");
-  // print_matrix_cuda(1, nr, bm, nr);
-
-  int total_thread_x = nf_c;
-  int total_thread_y = nc_c;
-  int total_thread_z = 1;
-  int tbx, tby, tbz, gridx, gridy, gridz;
-  dim3 threadsPerBlock, blockPerGrid;
-  size_t sm_size;
-
-  tbx = F; // std::max(F, std::min(F, total_thread_x));
-  tby = C; // std::max(C, std::min(C, total_thread_y));
-  tbz = 1;
-  sm_size = (C * F + 2) * (R + G) * sizeof(T);
-  gridx = ceil((float)total_thread_x / tbx);
-  gridy = ceil((float)total_thread_y / tby);
-  gridz = 1;
-  threadsPerBlock = dim3(tbx, tby, tbz);
-  blockPerGrid = dim3(gridx, gridy, gridz);
-  _ipk_3_3d<T, R, C, F, G><<<blockPerGrid, threadsPerBlock, sm_size,
-                             *(cudaStream_t *)handle.get(queue_idx)>>>(
-      nr_c, nc_c, nf_c, am, bm, ddist_r, dv, lddv1, lddv2);
-  gpuErrchk(cudaGetLastError());
-  if (handle.sync_and_check_all_kernels) {
-    gpuErrchk(cudaDeviceSynchronize());
-  }
-}
-
-template <DIM D, typename T>
-void ipk_3_3d(Handle<D, T> &handle, int nr_c, int nc_c, int nf_c, T *am, T *bm,
-              T *ddist_r, T *dv, int lddv1, int lddv2, int queue_idx,
-              int config) {
-
-#define IPK(R, C, F, G)                                                        \
-  {                                                                            \
-    ipk_3_3d_adaptive_launcher<D, T, R, C, F, G>(handle, nr_c, nc_c, nf_c, am, \
-                                                 bm, ddist_r, dv, lddv1,       \
-                                                 lddv2, queue_idx);            \
-  }
-
-  bool profile = false;
-  if (handle.profile_kernels) {
-    profile = true;
-  }
-  if (D == 3) {
-    if (profile || config == 6) {
-      IPK(2, 2, 128, 2)
-    }
-    if (profile || config == 5) {
-      IPK(2, 2, 64, 2)
-    }
-    if (profile || config == 4) {
-      IPK(2, 2, 32, 2)
-    }
-    if (profile || config == 3) {
-      IPK(2, 2, 16, 2)
-    }
-    if (profile || config == 2) {
-      IPK(8, 8, 8, 4)
-    }
-    if (profile || config == 1) {
-      IPK(4, 4, 4, 4)
-    }
-    if (profile || config == 0) {
-      IPK(2, 2, 2, 2)
-    }
-  } else {
-    printf("Error: ipk_3_3d is only for 3D data\n");
-  }
-#undef IPK
-}
-#endif
-} // namespace mgard_x
-
-#endif
\ No newline at end of file
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.h
deleted file mode 100644
index 3495c8a780..0000000000
--- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.h
+++ /dev/null
@@ -1,26 +0,0 @@
-/*
- * Copyright 2022, Oak Ridge National Laboratory.
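// ---------------------------------------------------------------------------
// [Editor's aside on the ipk_*_3d launchers deleted above] Each config level
// selects an (R, C, F, G) tile, and the launcher derives the launch shape by
// ceil-division plus a shared-memory budget covering the tile and its ghost
// rows. A standalone sketch of that arithmetic (ipk3_launch_shape is a
// hypothetical helper, not MGARD API; the sm_bytes formula copies
// ipk_3_3d_adaptive_launcher):
#include <cstddef>

struct LaunchShape {
  int tbx, tby, gridx, gridy;
  std::size_t sm_bytes;
};

template <typename T>
LaunchShape ipk3_launch_shape(int R, int C, int F, int G, int nc_c, int nf_c) {
  LaunchShape s;
  s.tbx = F;                    // threads along the fastest (f) dimension
  s.tby = C;                    // threads along the c dimension
  s.gridx = (nf_c + F - 1) / F; // ceil(nf_c / F), one block per tile column
  s.gridy = (nc_c + C - 1) / C; // ceil(nc_c / C)
  // C * F threads each stream R + G rows; the extra "+ 2" rows stage am/dist.
  s.sm_bytes = std::size_t(C * F + 2) * (R + G) * sizeof(T);
  return s;
}
// ---------------------------------------------------------------------------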
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_LEVELWISE_PROCESSING_KERNEL -#define MGARD_X_LEVELWISE_PROCESSING_KERNEL - -#include "Common.h" - -namespace mgard_x { - -template -void lwpk(Handle &handle, thrust::device_vector shape, T *dv, - thrust::device_vector ldvs, T *dwork, - thrust::device_vector ldws, int queue_idx); - -template -void lwpk(Handle &handle, SIZE *shape_h, SIZE *shape_d, T *dv, SIZE *ldvs, - T *dwork, SIZE *ldws, int queue_idx); - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.hpp index c2d7f3fc77..607f2c650c 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LevelwiseProcessingKernel.hpp @@ -126,15 +126,22 @@ class LwpkReo : public AutoTuner { int queue_idx) { int range_l = std::min(6, (int)std::log2(v.getShape(0)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); + int config = AutoTuner::autoTuningTable.lwpk[prec][range_l]; + + while (LWPK_CONFIG[D - 1][config][0] * LWPK_CONFIG[D - 1][config][1] * + LWPK_CONFIG[D - 1][config][2] > + DeviceRuntime::GetMaxNumThreadsPerTB()) { + config--; + if (config < 0) { + std::cout << log::log_err + << "Cannot find suitable config for LwpkReo.\n"; + } + } double min_time = std::numeric_limits::max(); int min_config = 0; - // int config = 0; - int config = AutoTuner::autoTuningTable.lwpk[prec][range_l]; - #define LWPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ const int R = LWPK_CONFIG[D - 1][CONFIG][0]; \ @@ -168,117 +175,6 @@ class LwpkReo : public AutoTuner { } }; -// template -// __global__ void _lwpk(SIZE *shape, T *dv, SIZE *ldvs, T *dwork, SIZE *ldws) { - -// size_t threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; -// SIZE *sm = SharedMemory(); -// SIZE *shape_sm = sm; -// SIZE *ldvs_sm = shape_sm + D; -// SIZE *ldws_sm = ldvs_sm + D; - -// if (threadId < D) { -// shape_sm[threadId] = shape[threadId]; -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// __syncthreads(); - -// SIZE idx[D]; -// SIZE firstD = div_roundup(shape_sm[0], F); - -// SIZE bidx = FunctorBase::GetBlockIdX(); -// idx[0] = (bidx % firstD) * F + threadIdx.x; - -// // printf("firstD %d idx[0] %d\n", firstD, idx[0]); - -// bidx /= firstD; -// if (D >= 2) -// idx[1] = blockIdx.y * blockDim.y + threadIdx.y; -// if (D >= 3) -// idx[2] = blockIdx.z * blockDim.z + threadIdx.z; - -// for (DIM d = 3; d < D; d++) { -// idx[d] = bidx % shape_sm[d]; -// bidx /= shape_sm[d]; -// } -// // int z = blockIdx.z * blockDim.z + threadIdx.z; -// // int y = blockIdx.y * blockDim.y + threadIdx.y; -// // int x = blockIdx.z * blockDim.z + threadIdx.z; -// bool in_range = true; -// for (DIM d = 0; d < D; d++) { -// if (idx[d] >= shape_sm[d]) -// in_range = false; -// } -// if (in_range) { -// // printf("%d %d %d %d\n", idx[3], idx[2], idx[1], idx[0]); -// if (OP == COPY) -// dwork[get_idx(ldws, idx)] = dv[get_idx(ldvs, idx)]; -// if (OP == ADD) -// dwork[get_idx(ldws, idx)] += dv[get_idx(ldvs, idx)]; -// if (OP == SUBTRACT) -// dwork[get_idx(ldws, idx)] -= dv[get_idx(ldvs, idx)]; -// } -// } - -// 
template -// void lwpk_adaptive_launcher(Handle &handle, SIZE *shape_h, SIZE -// *shape_d, -// T *dv, SIZE *ldvs, T *dwork, SIZE *ldws, -// int queue_idx) { - -// SIZE total_thread_z = shape_h[2]; -// SIZE total_thread_y = shape_h[1]; -// SIZE total_thread_x = shape_h[0]; -// // linearize other dimensions -// SIZE tbz = R; -// SIZE tby = C; -// SIZE tbx = F; -// SIZE gridz = ceil((float)total_thread_z / tbz); -// SIZE gridy = ceil((float)total_thread_y / tby); -// SIZE gridx = ceil((float)total_thread_x / tbx); -// for (DIM d = 3; d < D; d++) { -// gridx *= shape_h[d]; -// } - -// // printf("exec: %d %d %d %d %d %d\n", tbx, tby, tbz, gridx, gridy, gridz); -// dim3 threadsPerBlock(tbx, tby, tbz); -// dim3 blockPerGrid(gridx, gridy, gridz); -// size_t sm_size = (D * 3) * sizeof(SIZE); -// _lwpk<<>>( -// shape_d, dv, ldvs, dwork, ldws); - -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lwpk(Handle &handle, SIZE *shape_h, SIZE *shape_d, T *dv, SIZE -// *ldvs, -// T *dwork, SIZE *ldws, int queue_idx) { -// #define COPYLEVEL(R, C, F) \ -// { \ -// lwpk_adaptive_launcher(handle, shape_h, shape_d, dv, \ -// ldvs, dwork, ldws, queue_idx); -// \ -// } -// if (D >= 3) { -// COPYLEVEL(4, 4, 4) -// } -// if (D == 2) { -// COPYLEVEL(1, 4, 4) -// } -// if (D == 1) { -// COPYLEVEL(1, 1, 8) -// } - -// #undef COPYLEVEL -// } - template class LevelwiseCalcNDFunctor : public Functor { diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.h deleted file mode 100644 index 0d5dde4b48..0000000000 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.h +++ /dev/null @@ -1,44 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
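// ---------------------------------------------------------------------------
// [Editor's aside on the commented-out lwpk code removed above] Stripped of
// the N-D index bookkeeping, the levelwise kernel is an elementwise
// COPY/ADD/SUBTRACT between the data buffer and the work buffer. A serial
// reference for the flat case (names are illustrative, not MGARD API; the OP
// semantics copy the deleted kernel body):
enum LwpkOp { LWPK_COPY, LWPK_ADD, LWPK_SUBTRACT };

template <typename T>
void lwpk_reference(long n, const T *dv, T *dwork, LwpkOp op) {
  for (long i = 0; i < n; i++) {
    if (op == LWPK_COPY)
      dwork[i] = dv[i]; // overwrite the work buffer
    else if (op == LWPK_ADD)
      dwork[i] += dv[i]; // apply a correction
    else
      dwork[i] -= dv[i]; // remove a correction
  }
}
// ---------------------------------------------------------------------------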
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_LINEAR_PROCESSING_KERNEL -#define MGARD_X_LINEAR_PROCESSING_KERNEL - -#include "../../Common.h" - -namespace mgard_x { - -template -void lpk_reo_1(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, - SIZE *shape_d, SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, - DIM processed_n, DIM *processed_dims_h, DIM *processed_dims_d, - DIM curr_dim_r, DIM curr_dim_c, DIM curr_dim_f, T *ddist_f, - T *dratio_f, T *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, - LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, - int queue_idx, int config); - -template -void lpk_reo_2(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, - SIZE *shape_d, SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, - DIM processed_n, DIM *processed_dims_h, DIM *processed_dims_d, - DIM curr_dim_r, DIM curr_dim_c, DIM curr_dim_f, T *ddist_c, - T *dratio_c, T *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, - LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, - int queue_idx, int config); - -template -void lpk_reo_3(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, - SIZE *shape_d, SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, - DIM processed_n, DIM *processed_dims_h, DIM *processed_dims_d, - DIM curr_dim_r, DIM curr_dim_c, DIM curr_dim_f, T *ddist_r, - T *dratio_r, T *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, - LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, - int queue_idx, int config); - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.hpp index 909e5b5a2e..99bdcf2394 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel.hpp @@ -552,14 +552,11 @@ class Lpk1Reo : public AutoTuner { SubArray v1, SubArray v2, SubArray w, int queue_idx) { int range_l = std::min(6, (int)std::log2(shape.dataHost()[curr_dim_f]) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr1[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk1_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -572,22 +569,26 @@ class Lpk1Reo : public AutoTuner { shape, shape_c, processed_n, processed_dims, curr_dim_r, curr_dim_c, \ curr_dim_f, dist_f, ratio_f, v1, v2, w, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk1Reo.\n"; + exit(-1); + } #undef LPK if 
(AutoTuner::ProfileKernels) { @@ -1044,12 +1045,10 @@ class Lpk2Reo : public AutoTuner { std::min(6, (int)std::log2(shape_c.dataHost()[curr_dim_f]) - 1); int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr2[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk2_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -1062,22 +1061,26 @@ class Lpk2Reo : public AutoTuner { shape, shape_c, processed_n, processed_dims, curr_dim_r, curr_dim_c, \ curr_dim_f, dist_c, ratio_c, v1, v2, w, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk2Reo.\n"; + exit(-1); + } #undef LPK if (AutoTuner::ProfileKernels) { @@ -1547,12 +1550,10 @@ class Lpk3Reo : public AutoTuner { std::min(6, (int)std::log2(shape_c.dataHost()[curr_dim_f]) - 1); int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr3[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk3_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -1565,22 +1566,26 @@ class Lpk3Reo : public AutoTuner { shape, shape_c, processed_n, processed_dims, curr_dim_r, curr_dim_c, \ curr_dim_f, dist_r, ratio_r, v1, v2, w, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk3Reo.\n"; + exit(-1); + } #undef LPK if (AutoTuner::ProfileKernels) { @@ -1589,1434 +1594,6 @@ class Lpk3Reo : public AutoTuner { } }; -// template -// __global__ void -// _lpk_reo_1(SIZE *shape, SIZE *shape_c, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, -// DIM *processed_dims, DIM curr_dim_r, DIM curr_dim_c, DIM -// curr_dim_f, T *ddist_f, T *dratio_f, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, -// LENGTH lddw2) { - -// // bool debug = false; -// // if (blockIdx.z == 0 && blockIdx.y == 1 && blockIdx.x == 1 && -// // threadIdx.z == 
0 && threadIdx.y == 0 ) debug = false; - -// // bool debug = false; -// // if (threadIdx.z == 0 && threadIdx.y == 0 && threadIdx.x == 0 ) debug = -// // true; - -// LENGTH threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; - -// T *sm = SharedMemory(); -// SIZE ldsm1 = F * 2 + 3; -// SIZE ldsm2 = C; -// T *v_sm = sm; sm += ldsm1 * ldsm2 * R; - -// T *dist_f_sm = sm; sm += ldsm1; -// T *ratio_f_sm = sm; sm += ldsm1; - -// SIZE * sm_size = (SIZE*)sm; -// SIZE *shape_sm = sm_size; sm_size += D; -// SIZE *shape_c_sm = sm_size; sm_size += D; -// SIZE *ldvs_sm = sm_size; sm_size += D; -// SIZE *ldws_sm = sm_size; sm_size += D; -// sm = (T*)sm_size; - -// DIM * sm_dim = (DIM*)sm; -// DIM *processed_dims_sm = sm_dim; sm_dim += D; -// sm = (T*)sm_dim; - -// SIZE idx[D]; -// if (threadId < D) { -// shape_sm[threadId] = shape[threadId]; -// shape_c_sm[threadId] = shape_c[threadId]; -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// if (threadId < processed_n) { -// processed_dims_sm[threadId] = processed_dims[threadId]; -// } -// __syncthreads(); - -// for (DIM d = 0; d < D; d++) -// idx[d] = 0; - -// SIZE nr = shape_sm[curr_dim_r]; -// SIZE nc = shape_sm[curr_dim_c]; -// SIZE nf = shape_sm[curr_dim_f]; -// SIZE nf_c = shape_c_sm[curr_dim_f]; - -// bool zero_other = true; -// bool PADDING = (nf % 2 == 0); - -// SIZE bidx = blockIdx.x; -// SIZE firstD; -// if (nf_c % 2 == 1) { -// firstD = div_roundup(nf_c, blockDim.x); -// } else { -// firstD = div_roundup(nf_c, blockDim.x); -// } -// SIZE blockId = bidx % firstD; -// bidx /= firstD; - -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_r && d != curr_dim_c && d != curr_dim_f) { -// SIZE t = shape_sm[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims[k]) { -// t = shape_c_sm[d]; -// } -// } -// idx[d] = bidx % t; -// bidx /= t; -// if (idx[d] >= shape_c_sm[d]) -// zero_other = false; -// } -// } - -// SIZE zero_r = shape_c_sm[curr_dim_r]; -// SIZE zero_c = shape_c_sm[curr_dim_c]; -// SIZE zero_f = shape_c_sm[curr_dim_f]; - -// if (D < 3) { -// nr = 1; -// zero_r = 1; -// } -// if (D < 2) { -// nc = 1; -// zero_c = 1; -// } - -// LENGTH other_offset_v = get_idx(ldvs_sm, idx); -// LENGTH other_offset_w = get_idx(ldws_sm, idx); - -// dv1 = dv1 + other_offset_v; -// dv2 = dv2 + other_offset_v; -// dw = dw + other_offset_w; - -// // if (debug2) { -// // printf("idx: %d %d %d %d\n", idx[3], idx[2], idx[1], idx[0]); -// // printf("other_offset_v: %llu\n", other_offset_v); -// // printf("other_offset_w: %llu\n", other_offset_w); -// // } -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockId * blockDim.x + threadIdx.x; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_F = F; -// if (nf_c - blockId * blockDim.x < F) { -// actual_F = nf_c - blockId * blockDim.x; -// } - -// // if (nf_c % 2 == 1){ -// // if(nf_c-1 - blockId * blockDim.x < F) { actual_F = nf_c - 1 - blockId -// * -// // blockDim.x; } -// // } else { -// // if(nf_c - blockId * blockDim.x < F) { actual_F = nf_c - blockId * -// // blockDim.x; } -// // } - -// // if (debug) printf("actual_F %d\n", actual_F); - -// if (r_gl < nr && c_gl < nc && f_gl < nf_c) { -// if (zero_other && r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // if (debug) printf("load left vsm[%d]: 0.0\n", f_sm * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 
2)] = 0.0; -// } else { -// // if (debug) printf("load left vsm[%d]<-dv1[%d, %d, %d]: %f\n", f_sm * -// 2 -// // + 2, r_gl, c_gl, f_gl, dv1[get_idx(lddv11, lddv12, r_gl, c_gl, -// f_gl)]); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 2)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; -// } - -// if (f_sm == actual_F - 1) { -// if (zero_other && r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // if (debug) printf("load left+1 vsm[%d]: 0.0\n", actual_F * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = 0.0; -// } else { -// if (f_gl + 1 < nf_c) { -// // if (debug) printf("load left+1 vsm[%d]: %f\n", actual_F * 2 + 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl + 1)]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl + 1)]; -// } else { -// // if (debug) printf("load left+1 vsm[%d]: 0.0\n", actual_F * 2 + -// 2); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = -// 0.0; -// } -// } -// } - -// if (f_sm == 0) { -// // left -// if (zero_other && r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // coarse (-1) -// // if (debug) printf("load left-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = 0.0; -// } else { -// if (f_gl >= 1) { -// // other (-1) -// // if (debug) printf("load left-1 vsm[0]: %f\n", -// dv1[get_idx(lddv11, -// // lddv12, r_gl, c_gl, f_gl-1)]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl - 1)]; -// } else { -// // other (-1) -// // if (debug) printf("load left-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = 0.0; -// } -// } -// } - -// // right -// if (!PADDING) { //other = nf_c - 1 -// if (nf_c % 2 != 0) { -// if (f_gl >= 1 && f_gl < nf_c) { //shift for better memory access -// pattern -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)], r_gl, -// // c_gl, f_gl - 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0, do not shift -// if (f_gl < nf_c - 1) { -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 3, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)], r_gl, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = 0.0; -// } -// } -// } else { // PADDING other = nf_c - 2 -// if (nf_c % 2 != 0) { -// if (f_gl >= 1 && f_gl < nf_c - 1) { //shift for better memory access -// pattern -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)], r_gl, -// // c_gl, f_gl - 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl < nf_c - 2) { // do not shift -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 3, 
dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)], r_gl, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = 0.0; -// } -// } -// } - -// if (f_sm == actual_F - 1) { -// // right (+1) -// if (!PADDING) { -// if (nf_c % 2 != 0) { -// if (f_gl < nf_c - 1) { -// // if (debug) printf("load right+1 vsm[%d]: %f <- %d %d %d\n", -// // actual_F * 2 + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, -// f_gl)], -// // r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right+1 vsm[%d]: 0.0\n", actual_F * 2 -// + -// // 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl >= actual_F) { -// // if (debug) printf("load right-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)], -// r_gl, -// // c_gl, f_gl - actual_F); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)]; -// } else { -// // if (debug) printf("load right-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = 0.0; -// } -// } -// } else { -// if (nf_c % 2 != 0) { -// if (f_gl < nf_c - 2) { -// // if (debug) printf("actual_F(%d), load right+1 vsm[%d]: %f <- -// %d %d %d\n", -// // actual_F, actual_F * 2 + 1, -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)], -// r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right+1 vsm[%d]: 0.0\n", actual_F * 2 -// + -// // 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl >= actual_F && f_gl - actual_F < nf_c - 2) { -// // if (debug) printf("load right-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)], -// r_gl, -// // c_gl, f_gl - actual_F); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)]; -// } else { -// // if (debug) printf("load right-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = 0.0; -// } -// } -// } -// } -// } - -// // if (debug) printf("actual_F: %d\n", actual_F); -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_F) { -// // if (debug) printf("blockId * F * 2 + f_sm = %d\n", blockId * F * 2 + -// f_sm); if (blockId * F * 2 + f_sm < nf) { // padding: num of dist == nf, -// non-padding: non of dist == nf - 1 -// // if (debug) printf("load dist/ratio1[%d]: %f <- %d\n", 2 + f_sm, -// ddist_f[blockId * F * 2 + f_sm], blockId * F * 2 + f_sm); dist_f_sm[2 + -// f_sm] = ddist_f[blockId * F * 2 + f_sm]; ratio_f_sm[2 + f_sm] = -// dratio_f[blockId * F * 2 + f_sm]; -// } else { -// // if (debug) printf("load dist/ratio1[%d]: 0.0\n", 2 + f_sm); -// dist_f_sm[2 + f_sm] = 0.0; -// ratio_f_sm[2 + f_sm] = 0.0; -// } - -// if (blockId * F * 2 + actual_F + f_sm < nf) { -// // if (debug) printf("load dist/ratio2[%d]: %f <- %d\n", 2 + actual_F + -// f_sm, ddist_f[blockId * F * 2 + actual_F + f_sm], blockId * F * 2 + -// actual_F + f_sm); dist_f_sm[2 + actual_F + f_sm] = -// ddist_f[blockId * F * 2 + actual_F + f_sm]; -// ratio_f_sm[2 + actual_F + f_sm] = -// dratio_f[blockId * 
F * 2 + actual_F + f_sm]; -// } else { -// // if (debug) printf("load dist/ratio2[%d]: 0.0\n", 2 + actual_F + -// f_sm); dist_f_sm[2 + actual_F + f_sm] = 0.0; ratio_f_sm[2 + actual_F + -// f_sm] = 0.0; -// } -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// // dist_f_sm[f_sm] = ddist_f[f_gl - 2]; -// // ratio_f_sm[f_sm] = dratio_f[f_gl - 2]; -// // if (debug) printf("load dist/ratio-1[%d]: %f <- %d\n", f_sm, -// ddist_f[blockId * F * 2 + f_sm - 2], blockId * F * 2 + f_sm - 2); -// dist_f_sm[f_sm] = ddist_f[blockId * F * 2 + f_sm - 2]; -// ratio_f_sm[f_sm] = dratio_f[blockId * F * 2 + f_sm - 2]; -// } -// } else { -// if (f_sm < 2) { -// // if (debug) printf("load dist/ratio-1[%d]: 0.0 <- %d\n", f_sm); -// dist_f_sm[f_sm] = 0.0; -// ratio_f_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// if (r_gl < nr && c_gl < nc && f_gl < nf_c) { -// T h1 = dist_f_sm[f_sm * 2]; -// T h2 = dist_f_sm[f_sm * 2 + 1]; -// T h3 = dist_f_sm[f_sm * 2 + 2]; -// T h4 = dist_f_sm[f_sm * 2 + 3]; -// T r1 = ratio_f_sm[f_sm * 2]; -// T r2 = ratio_f_sm[f_sm * 2 + 1]; -// T r3 = ratio_f_sm[f_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 2)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 4)]; - -// // bool debug = false; -// // if (idx[3] == 0) debug = false; -// // if (debug) { -// // printf("f_sm(%d) %f %f %f %f %f f_sm_h %f %f %f %f f_sm_r %f %f %f -// %f, out: %f\n",f_sm, a,b,c,d,e, h1,h2,h3,h4,r1,r2,r3,r4, mass_trans(a, b, -// c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); -// // } - -// // T tb = a * h1/6 + b * (h1+h2)/3 + c * h2/6; -// // T tc = b * h2/6 + c * (h2+h3)/3 + d * h3/6; -// // T td = c * h3/6 + d * (h3+h4)/3 + e * h4/6; - -// // if (debug) printf("f_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4); - -// // if (debug) printf("store[%d %d %d] %f \n", r_gl, c_gl, f_gl, -// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); - -// // printf("test block %d F %d nf %d\n", blockId, F, nf); -// // if (f_gl+1 == nf_c-1) { - -// // // T te = h4 * d + 2 * h4 * e; -// // //printf("f_sm(%d) mm-e: %f\n", f_sm, te); -// // // te += td * r3; -// // dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl+1)] = -// // mass_trans(c, d, e, (T)0.0, (T)0.0, h1, h2, (T)0.0, (T)0.0, r1, -// r2, -// // (T)0.0, (T)0.0); -// // } -// } -// } - -// template -// void lpk_reo_1_adaptive_launcher(Handle &handle, SIZE *shape_h, -// SIZE *shape_c_h, SIZE *shape_d, SIZE -// *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, DIM *processed_dims_h, DIM -// *processed_dims_d, DIM curr_dim_r, DIM -// curr_dim_c, DIM curr_dim_f, T *ddist_f, T -// *dratio_f, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH -// lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, -// int queue_idx) { -// SIZE nr = shape_h[curr_dim_r]; -// SIZE nc = shape_h[curr_dim_c]; -// SIZE nf = shape_h[curr_dim_f]; -// SIZE nf_c = shape_c_h[curr_dim_f]; - -// SIZE total_thread_z = nr; -// SIZE total_thread_y = nc; -// SIZE total_thread_x = nf_c; -// // if (nf_c % 2 == 1) { total_thread_x = nf_c - 1; } -// // else { total_thread_x = nf_c; } -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, 
blockPerGrid; -// size_t sm_size; - -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = (R * C * (F * 2 + 3) + (F * 2 + 3) * 2) * sizeof(T); -// sm_size += (D * 4) * sizeof(SIZE); -// sm_size += (D * 1) * sizeof(DIM); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_f && d != curr_dim_c && d != curr_dim_r) { -// SIZE t = shape_h[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims_h[k]) { -// t = shape_c_h[d]; -// } -// } -// gridx *= t; -// } -// } -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); - -// // printf("lpk_reo_1 exec config (%d %d %d) (%d %d %d)\n", tbx, tby, tbz, -// // gridx, gridy, gridz); -// _lpk_reo_1<<>>( -// shape_d, shape_c_d, ldvs, ldws, processed_n, processed_dims_d, -// curr_dim_r, curr_dim_c, curr_dim_f, ddist_f, dratio_f, dv1, lddv11, -// lddv12, dv2, lddv21, lddv22, dw, lddw1, lddw2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lpk_reo_1(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE -// *shape_d, -// SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, -// DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, -// DIM curr_dim_c, DIM curr_dim_f, T *ddist_f, T *dratio_f, T -// *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, LENGTH lddv21, -// LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, int -// queue_idx, int config) { -// #define LPK(R, C, F) \ -// { \ -// lpk_reo_1_adaptive_launcher( \ -// handle, shape_h, shape_c_h, shape_d, shape_c_d, ldvs, ldws, \ -// processed_n, processed_dims_h, processed_dims_d, curr_dim_r, \ -// curr_dim_c, curr_dim_f, ddist_f, dratio_f, dv1, lddv11, lddv12, dv2, -// \ -// lddv21, lddv22, dw, lddw1, lddw2, queue_idx); \ -// } - -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D >= 3) { -// if (profile || config == 6) { -// LPK(2, 2, 128) -// } -// if (profile || config == 5) { -// LPK(2, 2, 64) -// } -// if (profile || config == 4) { -// LPK(2, 2, 32) -// } -// if (profile || config == 3) { -// LPK(4, 4, 16) -// } -// if (profile || config == 2) { -// LPK(8, 8, 8) -// } -// if (profile || config == 1) { -// LPK(4, 4, 4) -// } -// if (profile || config == 0) { -// LPK(2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// LPK(1, 2, 128) -// } -// if (profile || config == 5) { -// LPK(1, 2, 64) -// } -// if (profile || config == 4) { -// LPK(1, 2, 32) -// } -// if (profile || config == 3) { -// LPK(1, 4, 16) -// } -// if (profile || config == 2) { -// LPK(1, 8, 8) -// } -// if (profile || config == 1) { -// LPK(1, 4, 4) -// } -// if (profile || config == 0) { -// LPK(1, 2, 4) -// } -// } else if (D == 1) { -// if (profile || config == 6) { -// LPK(1, 1, 128) -// } -// if (profile || config == 5) { -// LPK(1, 1, 64) -// } -// if (profile || config == 4) { -// LPK(1, 1, 32) -// } -// if (profile || config == 3) { -// LPK(1, 1, 16) -// } -// if (profile || config == 2) { -// LPK(1, 1, 8) -// } -// if (profile || config == 1) { -// LPK(1, 1, 8) -// } -// if (profile || config == 0) { -// LPK(1, 1, 8) -// } -// } - -// #undef LPK -// } - -// template -// __global__ void -// _lpk_reo_2(SIZE *shape, SIZE *shape_c, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, -// DIM *processed_dims, DIM curr_dim_r, DIM curr_dim_c, DIM -// curr_dim_f, T 
*ddist_c, T *dratio_c, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, -// LENGTH lddw2) { - -// // bool debug = false; -// // if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 0 && -// // threadIdx.z == 0 && threadIdx.x == 0 ) debug = false; - -// // bool debug2 = false; -// // if (threadIdx.z == 0 && threadIdx.y == 0 && threadIdx.x == 0 ) debug2 = -// // false; - -// LENGTH threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; - -// T *sm = SharedMemory(); -// SIZE ldsm1 = F; -// SIZE ldsm2 = C * 2 + 3; -// T *v_sm = sm; sm += ldsm1 * ldsm2 * R; - -// T *dist_c_sm = sm; sm += ldsm2; -// T *ratio_c_sm = sm; sm += ldsm2; - -// SIZE * sm_size = (SIZE*)sm; -// SIZE *shape_sm = sm_size; sm_size += D; -// SIZE *shape_c_sm = sm_size; sm_size += D; -// SIZE *ldvs_sm = sm_size; sm_size += D; -// SIZE *ldws_sm = sm_size; sm_size += D; -// sm = (T*)sm_size; - -// DIM * sm_dim = (DIM*)sm; -// DIM *processed_dims_sm = sm_dim; sm_dim += D; -// sm = (T*)sm_dim; - -// SIZE idx[D]; -// if (threadId < D) { -// shape_sm[threadId] = shape[threadId]; -// shape_c_sm[threadId] = shape_c[threadId]; -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// if (threadId < processed_n) { -// processed_dims_sm[threadId] = processed_dims[threadId]; -// } -// __syncthreads(); - -// for (DIM d = 0; d < D; d++) -// idx[d] = 0; - -// SIZE nr = shape_sm[curr_dim_r]; -// SIZE nc = shape_sm[curr_dim_c]; -// SIZE nf_c = shape_c_sm[curr_dim_f]; -// SIZE nc_c = shape_c_sm[curr_dim_c]; -// bool PADDING = (nc % 2 == 0); - -// if (D < 3) { -// nr = 1; -// } - -// SIZE bidx = blockIdx.x; -// SIZE firstD = div_roundup(nf_c, blockDim.x); -// SIZE blockId_f = bidx % firstD; -// bidx /= firstD; - -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_r && d != curr_dim_c && d != curr_dim_f) { -// SIZE t = shape_sm[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims[k]) { -// t = shape_c_sm[d]; -// } -// } -// idx[d] = bidx % t; -// bidx /= t; -// } -// } - -// LENGTH other_offset_v = get_idx(ldvs_sm, idx); -// LENGTH other_offset_w = get_idx(ldws_sm, idx); - -// dv1 = dv1 + other_offset_v; -// dv2 = dv2 + other_offset_v; -// dw = dw + other_offset_w; - -// // if (debug2) { -// // printf("idx: %d %d %d %d\n", idx[3], idx[2], idx[1], idx[0]); -// // printf("other_offset_v: %llu\n", other_offset_v); -// // printf("other_offset_w: %llu\n", other_offset_w); -// // } - -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockId_f * blockDim.x + threadIdx.x; - -// SIZE blockId = blockIdx.y; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_C = C; -// if (nc_c - blockIdx.y * blockDim.y < C) { -// actual_C = nc_c - blockIdx.y * blockDim.y; -// } - -// // if (nc_c % 2 == 1){ -// // if(nc_c-1 - blockIdx.y * blockDim.y < C) { actual_C = nc_c - 1 - -// // blockIdx.y * blockDim.y; } -// // } else { -// // if(nc_c - blockIdx.y * blockDim.y < C) { actual_C = nc_c - blockIdx.y -// * -// // blockDim.y; } -// // } - -// // bool debug = false; -// // if (idx[3] == 0 && r_gl == 0 ) debug = false; - -// // if (debug) printf("actual_C %d\n", actual_C); - -// if (r_gl < nr && c_gl < nc_c && f_gl < nf_c) { -// // if (debug) printf("load up vsm[%d]: %f <- %d %d %d\n", c_sm * 2 + 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, f_gl); -// 
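The kernel being deleted above folds every dimension other than the three currently processed (r, c, f) into gridDim.x: the launcher multiplies gridx by each leftover dimension's (possibly coarsened) size, and the kernel recovers the per-dimension indices with repeated mod/div against those same sizes. A minimal host-side sketch of that encode/decode round trip; `fold_extra_dims` and `unfold_extra_dims` are illustrative names, not MGARD-X functions:

```cpp
#include <cstdint>
#include <vector>

// Hypothetical stand-ins for MGARD-X's SIZE/DIM typedefs.
using SIZE = uint32_t;
using DIM = uint32_t;

// Launcher side: fold the sizes of all "extra" dimensions into gridDim.x,
// mirroring the deleted launchers' `gridx *= t;` loop.
SIZE fold_extra_dims(SIZE gridx, const std::vector<SIZE> &extra_sizes) {
  for (SIZE t : extra_sizes)
    gridx *= t;
  return gridx;
}

// Kernel side: peel the extra-dimension indices back off blockIdx.x with
// repeated mod/div, as the deleted `_lpk_reo_*` kernels do with
// `idx[d] = bidx % t; bidx /= t;`.
std::vector<SIZE> unfold_extra_dims(SIZE bidx,
                                    const std::vector<SIZE> &extra_sizes) {
  std::vector<SIZE> idx(extra_sizes.size());
  for (DIM d = 0; d < extra_sizes.size(); d++) {
    idx[d] = bidx % extra_sizes[d];
    bidx /= extra_sizes[d];
  }
  return idx;
}
```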
v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 2, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; - -// if (c_sm == actual_C - 1) { -// if (c_gl + 1 < nc_c) { -// // if (debug) printf("load up+1 vsm[%d]: %f <- %d %d %d\n", actual_C -// * 2 -// // + 2, dv1[get_idx(lddv11, lddv12, r_gl, blockId * C + actual_C, -// // f_gl)], r_gl, blockId * C + actual_C, f_gl); -// // c_gl+1 == blockId * C + C -// v_sm[get_idx(ldsm1, ldsm2, r_sm, actual_C * 2 + 2, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl + 1, f_gl)]; -// } else { -// // if (debug) printf("load up+1 vsm[%d]: 0.0\n", actual_C * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, actual_C * 2 + 2, f_sm)] = 0.0; -// } -// } - -// if (c_sm == 0) { -// if (c_gl >= 1) { -// // if (debug) printf("load up-1 vsm[0]: %f <- %d %d %d\n", -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl-1, f_gl)], r_gl, c_gl-1, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl - 1, f_gl)]; -// } else { -// // if (debug) printf("load up-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = 0.0; -// } -// } - -// if (!PADDING) { -// if (c_gl < nc_c - 1) { -// // if (debug) printf("load down vsm[%d]: %f <- %d %d %d\n", c_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load down vsm[%d]: 0.0\n", c_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = 0.0; -// } -// } else { -// if (c_gl < nc_c - 2) { -// // if (debug) printf("load down vsm[%d]: %f <- %d %d %d\n", c_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load down vsm[%d]: 0.0\n", c_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = 0.0; -// } -// } - -// if (c_gl >= 1 && -// (PADDING && c_gl - 1 < nc_c - 2 || !PADDING && c_gl - 1 < nc_c - 1)) -// { -// if (c_sm == 0) { -// // if (debug) printf("load down-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv11, lddv12, r_gl, c_gl-1, f_gl)], r_gl, c_gl-1, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 1, f_sm)] = -// dv2[get_idx(lddv11, lddv12, r_gl, c_gl - 1, f_gl)]; -// } -// } else { -// if (c_sm == 0) { -// // if (debug) printf("load down-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 1, f_sm)] = 0.0; -// } -// } -// } - -// // load dist/ratio using f_sm for better performance -// // assumption F >= C -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_C) { -// if (blockId * C * 2 + f_sm < nc) { -// dist_c_sm[2 + f_sm] = ddist_c[blockId * C * 2 + f_sm]; -// ratio_c_sm[2 + f_sm] = dratio_c[blockId * C * 2 + f_sm]; -// } else { -// dist_c_sm[2 + f_sm] = 0.0; -// ratio_c_sm[2 + f_sm] = 0.0; -// } - -// if (blockId * C * 2 + actual_C + f_sm < nc) { -// dist_c_sm[2 + actual_C + f_sm] = -// ddist_c[blockId * C * 2 + actual_C + f_sm]; -// ratio_c_sm[2 + actual_C + f_sm] = -// dratio_c[blockId * C * 2 + actual_C + f_sm]; -// } else { -// dist_c_sm[2 + actual_C + f_sm] = 0.0; -// ratio_c_sm[2 + actual_C + f_sm] = 0.0; -// } -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// dist_c_sm[f_sm] = ddist_c[blockId * C * 2 - 2 + f_sm]; -// ratio_c_sm[f_sm] = dratio_c[blockId * C * 2 - 2 + f_sm]; -// } -// } else { -// if (f_sm < 2) { -// dist_c_sm[f_sm] = 0.0; -// 
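The load phase above stages each block's 2*C distance and ratio entries into shared memory behind a two-element halo taken from the previous block, zero-filling anything past the end of the array. A condensed CUDA sketch of that staging pattern, assuming at least 2*C threads along f so one pass suffices; the deleted code instead loads in two halves of `actual_C` to handle tail blocks:

```cuda
// Condensed sketch of the dist/ratio staging in the deleted kernels: slots
// [0,1] hold the previous block's last two entries (or 0 at the left edge),
// and slots [2 .. 2*C+1] hold this block's 2*C entries, zero-padded past n.
template <typename T, int C>
__device__ void load_dist_with_halo(const T *ddist, T *dist_sm, int n) {
  const int blockId = blockIdx.y; // _lpk_reo_2 tiles the c axis along y
  const int tid = threadIdx.x;    // f-threads do the loading "for better
                                  // performance", per the deleted comment
  if (tid < 2 * C) {
    int gl = blockId * C * 2 + tid;
    dist_sm[2 + tid] = (gl < n) ? ddist[gl] : (T)0.0;
  }
  if (tid < 2) { // two-entry halo reaching back into the previous block
    dist_sm[tid] = (blockId > 0) ? ddist[blockId * C * 2 - 2 + tid] : (T)0.0;
  }
  __syncthreads();
}
```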
ratio_c_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// if (r_gl < nr && c_gl < nc_c && f_gl < nf_c) { -// T h1 = dist_c_sm[c_sm * 2]; -// T h2 = dist_c_sm[c_sm * 2 + 1]; -// T h3 = dist_c_sm[c_sm * 2 + 2]; -// T h4 = dist_c_sm[c_sm * 2 + 3]; -// T r1 = ratio_c_sm[c_sm * 2]; -// T r2 = ratio_c_sm[c_sm * 2 + 1]; -// T r3 = ratio_c_sm[c_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2, f_sm)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 1, f_sm)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 2, f_sm)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 4, f_sm)]; - -// // if (debug) { -// // printf("c_sm(%d) %f %f %f %f %f\n",c_sm, a,b,c,d,e); -// // printf("c_sm_h(%d) %f %f %f %f\n",c_sm, h1,h2,h3,h4); -// // printf("c_sm_r(%d) %f %f %f %f\n",c_sm, r1,r2,r3,r4); -// // } - -// // T tb = a * h1 + b * 2 * (h1+h2) + c * h2; -// // T tc = b * h2 + c * 2 * (h2+h3) + d * h3; -// // T td = c * h3 + d * 2 * (h3+h4) + e * h4; - -// // if (debug) printf("c_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) { -// // printf("mr2(%d) mm2: %f -> (%d %d %d)\n", c_sm, tc, r_gl, c_gl, -// f_gl); -// // // printf("f_sm(%d) b c d: %f %f %f\n", f_sm, tb, tc, td); -// // } - -// // if (debug) { -// // printf("f_sm(%d) %f %f %f %f %f f_sm_h %f %f %f %f f_sm_r %f %f %f -// %f, out: %f\n",f_sm, a,b,c,d,e, h1,h2,h3,h4,r1,r2,r3,r4, mass_trans(a, b, -// c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); -// // } - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4); - -// // if (debug) printf("store[%d %d %d] %f \n", r_gl, c_gl, f_gl, -// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); - -// // printf("%d %d %d\n", r_gl, c_gl, f_gl); -// // if (blockId * C + C == nc-1) { -// // if (c_gl + 1 == nc_c - 1) { -// // // T te = h4 * d + 2 * h4 * e; -// // // te += td * r3; -// // dw[get_idx(lddw1, lddw2, r_gl, blockId * C + actual_C, f_gl)] = -// // mass_trans(c, d, e, (T)0.0, (T)0.0, -// // h1, h2, (T)0.0, (T)0.0, r1, r2, (T)0.0, (T)0.0); -// // } -// // } -// } -// } - -// template -// void lpk_reo_2_adaptive_launcher(Handle &handle, SIZE *shape_h, -// SIZE *shape_c_h, SIZE *shape_d, SIZE -// *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, DIM *processed_dims_h, DIM -// *processed_dims_d, DIM curr_dim_r, DIM -// curr_dim_c, DIM curr_dim_f, T *ddist_c, T -// *dratio_c, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH -// lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, -// int queue_idx) { - -// SIZE nr = shape_h[curr_dim_r]; -// SIZE nc = shape_h[curr_dim_c]; -// SIZE nf = shape_h[curr_dim_f]; -// SIZE nc_c = shape_c_h[curr_dim_c]; -// SIZE nf_c = shape_c_h[curr_dim_f]; - -// SIZE total_thread_z = nr; -// SIZE total_thread_y = nc_c; -// // if (nc_c % 2 == 1) { total_thread_y = nc_c - 1; } -// // else { total_thread_y = nc_c; } -// SIZE total_thread_x = nf_c; -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = (R * (C * 2 + 3) * F + (C * 2 + 3) * 2) * sizeof(T); -// sm_size += (D * 4) * sizeof(SIZE); -// sm_size += (D * 1) * sizeof(DIM); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x 
/ tbx); -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_f && d != curr_dim_c && d != curr_dim_r) { -// SIZE t = shape_h[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims_h[k]) { -// t = shape_c_h[d]; -// } -// } -// gridx *= t; -// } -// } -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); -// // printf("lpk_reo_2 exec config (%d %d %d) (%d %d %d)\n", tbx, tby, tbz, -// // gridx, gridy, gridz); - -// _lpk_reo_2<<>>( -// shape_d, shape_c_d, ldvs, ldws, processed_n, processed_dims_d, -// curr_dim_r, curr_dim_c, curr_dim_f, ddist_c, dratio_c, dv1, lddv11, -// lddv12, dv2, lddv21, lddv22, dw, lddw1, lddw2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lpk_reo_2(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE -// *shape_d, -// SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, -// DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, -// DIM curr_dim_c, DIM curr_dim_f, T *ddist_c, T *dratio_c, T -// *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, LENGTH lddv21, -// LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, int -// queue_idx, int config) { - -// #define LPK(R, C, F) \ -// { \ -// lpk_reo_2_adaptive_launcher( \ -// handle, shape_h, shape_c_h, shape_d, shape_c_d, ldvs, ldws, \ -// processed_n, processed_dims_h, processed_dims_d,\ -// curr_dim_r, \ -// curr_dim_c, curr_dim_f, ddist_c, dratio_c, dv1, lddv11, lddv12, dv2, \ -// lddv21, lddv22, dw, lddw1, lddw2, queue_idx); \ -// } - -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D >= 3) { -// if (profile || config == 6) { -// LPK(2, 2, 128) -// } -// if (profile || config == 5) { -// LPK(2, 2, 64) -// } -// if (profile || config == 4) { -// LPK(2, 2, 32) -// } -// if (profile || config == 3) { -// LPK(4, 4, 16) -// } -// if (profile || config == 2) { -// LPK(8, 8, 8) -// } -// if (profile || config == 1) { -// LPK(4, 4, 4) -// } -// if (profile || config == 0) { -// LPK(2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// LPK(1, 2, 128) -// } -// if (profile || config == 5) { -// LPK(1, 2, 64) -// } -// if (profile || config == 4) { -// LPK(1, 2, 32) -// } -// if (profile || config == 3) { -// LPK(1, 4, 16) -// } -// if (profile || config == 2) { -// LPK(1, 8, 8) -// } -// if (profile || config == 1) { -// LPK(1, 4, 4) -// } -// if (profile || config == 0) { -// LPK(1, 2, 4) -// } -// } else { -// printf("Error: mass_trans_multiply_2_cpt is only for 3D and 2D data\n"); -// } -// #undef LPK -// } - -// template -// __global__ void -// _lpk_reo_3(SIZE *shape, SIZE *shape_c, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, -// DIM *processed_dims, DIM curr_dim_r, DIM curr_dim_c, DIM -// curr_dim_f, T *ddist_r, T *dratio_r, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH lddv22, T *dw, LENGTH lddw1, -// LENGTH lddw2) { - -// // bool debug = false; -// // if (blockIdx.z == gridDim.z-1 && blockIdx.y == 0 && blockIdx.x == 0 && -// // threadIdx.y == 0 && threadIdx.x == 0 ) debug = false; - -// // bool debug2 = false; -// // if (blockIdx.z == gridDim.z-1 && blockIdx.y == 1 && blockIdx.x == 16) -// // debug2 = false; - -// LENGTH threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; - -// T *sm = SharedMemory(); -// SIZE ldsm1 = F; -// SIZE ldsm2 = C; -// T *v_sm = sm; sm += ldsm1 * ldsm2 * (R * 2 + 3); - -// T 
*dist_r_sm = sm; sm += (R * 2 + 3); -// T *ratio_r_sm = sm; sm += (R * 2 + 3); - -// SIZE * sm_size = (SIZE*)sm; -// SIZE *shape_sm = sm_size; sm_size += D; -// SIZE *shape_c_sm = sm_size; sm_size += D; -// SIZE *ldvs_sm = sm_size; sm_size += D; -// SIZE *ldws_sm = sm_size; sm_size += D; -// sm = (T*)sm_size; - -// DIM * sm_dim = (DIM*)sm; -// DIM *processed_dims_sm = sm_dim; sm_dim += D; -// sm = (T*)sm_dim; - -// SIZE idx[D]; -// if (threadId < D) { -// shape_sm[threadId] = shape[threadId]; -// shape_c_sm[threadId] = shape_c[threadId]; -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// if (threadId < processed_n) { -// processed_dims_sm[threadId] = processed_dims[threadId]; -// } -// __syncthreads(); - -// for (DIM d = 0; d < D; d++) -// idx[d] = 0; - -// SIZE nr = shape_sm[curr_dim_r]; -// SIZE nf_c = shape_c_sm[curr_dim_f]; -// SIZE nc_c = shape_c_sm[curr_dim_c]; -// SIZE nr_c = shape_c_sm[curr_dim_r]; -// bool PADDING = (nr % 2 == 0); - -// SIZE bidx = blockIdx.x; -// SIZE firstD = div_roundup(nf_c, blockDim.x); -// SIZE blockId_f = bidx % firstD; -// bidx /= firstD; - -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_r && d != curr_dim_c && d != curr_dim_f) { -// SIZE t = shape_sm[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims[k]) { -// t = shape_c_sm[d]; -// } -// } -// idx[d] = bidx % t; -// bidx /= t; -// } -// } - -// LENGTH other_offset_v = get_idx(ldvs_sm, idx); -// LENGTH other_offset_w = get_idx(ldws_sm, idx); - -// dv1 = dv1 + other_offset_v; -// dv2 = dv2 + other_offset_v; -// dw = dw + other_offset_w; - -// // if (debug2) { -// // printf("idx: %d %d %d %d\n", idx[3], idx[2], idx[1], idx[0]); -// // printf("other_offset_v: %llu\n", other_offset_v); -// // printf("other_offset_w: %llu\n", other_offset_w); -// // } - -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockId_f * blockDim.x + threadIdx.x; - -// SIZE blockId = blockIdx.z; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_R = R; -// if (nr_c - blockIdx.z * blockDim.z < R) { -// actual_R = nr_c - blockIdx.z * blockDim.z; -// } -// // if (nr_c % 2 == 1){ -// // if(nr_c-1 - blockIdx.z * blockDim.z < R) { actual_R = nr_c - 1 - -// // blockIdx.z * blockDim.z; } -// // } else { -// // if(nr_c - blockIdx.z * blockDim.z < R) { actual_R = nr_c - blockIdx.z -// * -// // blockDim.z; } -// // } - -// // if (debug) printf("actual_R %d\n", actual_R); - -// // bool debug = false; -// // if (idx[3] == 0 && idx[2] == 0 && f_gl == 2 && c_gl == 1) debug = -// false; - -// // if (debug) printf("RCF: %d %d %d\n", R, C, F); -// if (r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// // if (debug) printf("load front vsm[%d]: %f <- %d %d %d\n", r_sm * 2 + -// 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 2, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; - -// if (r_sm == actual_R - 1) { -// if (r_gl + 1 < nr_c) { -// // if (debug) printf("load front+1 vsm[%d]: %f <- %d %d %d\n", -// actual_R -// // * 2 + 2, dv1[get_idx(lddv11, lddv12, blockId * R + actual_R, c_gl, -// // f_gl)], blockId * R + actual_R, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, actual_R * 2 + 2, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl + 1, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load front+1 vsm[%d]: 0.0\n", actual_R * 2 + -// 2); v_sm[get_idx(ldsm1, 
ldsm2, actual_R * 2 + 2, c_sm, f_sm)] = 0.0; -// } -// } - -// if (r_sm == 0) { -// if (r_gl >= 1) { -// // if (debug) printf("load front-1 vsm[0]: %f <- %d %d %d\n", -// // dv1[get_idx(lddv11, lddv12, r_gl-1, c_gl, f_gl)], r_gl-1, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl - 1, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load front-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = 0.0; -// } -// } - -// if (!PADDING) { -// if (r_gl < nr_c - 1) { -// // if (debug) printf("load back vsm[%d]: 0.0\n", r_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load back vsm[%d]: %f <- %d %d %d\n", r_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = 0.0; -// } -// } else { -// if (r_gl < nr_c - 2) { -// // if (debug) printf("load back vsm[%d]: %f <- %d %d %d\n", r_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load back vsm[%d]: 0.0\n", r_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = 0.0; -// } -// } - -// if (r_gl >= 1 && -// (PADDING && r_gl - 1 < nr_c - 2 || !PADDING && r_gl - 1 < nr_c - 1)) -// { -// // if (blockId > 0) { -// if (r_sm == 0) { -// // if (debug) printf("load back-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv11, lddv12, r_gl-1, c_gl, f_gl)], r_gl-1, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, 1, c_sm, f_sm)] = -// dv2[get_idx(lddv11, lddv12, r_gl - 1, c_gl, f_gl)]; -// } -// } else { -// if (r_sm == 0) { -// // if (debug) printf("load back-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, 1, c_sm, f_sm)] = 0.0; -// } -// } -// } - -// // load dist/ratio using f_sm for better performance -// // assumption F >= R -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_R) { -// if (blockId * R * 2 + f_sm < nr) { -// dist_r_sm[2 + f_sm] = ddist_r[blockId * R * 2 + f_sm]; -// // if (debug2 ) printf("load dist 1 [%d]: %f [%d]\n", 2 + f_sm, -// // dist_r_sm[2 + f_sm], blockId * R * 2 + f_sm); -// ratio_r_sm[2 + f_sm] = dratio_r[blockId * R * 2 + f_sm]; -// // if (debug2 )printf("load ratio 1 [%d]: %f [%d]\n", 2 + f_sm, -// // ratio_r_sm[2 + f_sm], blockId * R * 2 + f_sm); -// } else { -// dist_r_sm[2 + f_sm] = 0.0; -// ratio_r_sm[2 + f_sm] = 0.0; -// } -// if (blockId * R * 2 + actual_R + f_sm < nr) { -// dist_r_sm[2 + actual_R + f_sm] = -// ddist_r[blockId * R * 2 + actual_R + f_sm]; -// // if (debug2 )printf("load dist 2 [%d]: %f [%d]\n", 2 + actual_R + -// f_sm, -// // dist_r_sm[2 + actual_R + f_sm], blockId * R * 2 + actual_R + f_sm); -// ratio_r_sm[2 + actual_R + f_sm] = -// dratio_r[blockId * R * 2 + actual_R + f_sm]; -// // if (debug2 )printf("load ratio 2 [%d]: %f [%d]\n", 2 + actual_R + -// f_sm, -// // ratio_r_sm[2 + actual_R + f_sm], blockId * R * 2 + actual_R + f_sm); -// } else { -// dist_r_sm[2 + actual_R + f_sm] = 0.0; -// ratio_r_sm[2 + actual_R + f_sm] = 0.0; -// } -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// dist_r_sm[f_sm] = ddist_r[blockId * R * 2 - 2 + f_sm]; -// // if (debug2 )printf("load dist -1 [%d]: %f [%d]\n", f_sm, -// // dist_r_sm[f_sm], blockId * R * 2 - 2 + f_sm); -// ratio_r_sm[f_sm] = dratio_r[blockId * R * 2 - 2 + f_sm]; -// 
// if (debug2 )printf("load ratio -1 [%d]: %f [%d]\n", f_sm, -// // ratio_r_sm[f_sm], blockId * R * 2 - 2 + f_sm); -// } -// } else { -// if (f_sm < 2) { -// dist_r_sm[f_sm] = 0.0; -// ratio_r_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// int adjusted_nr_c = nr_c; -// if (r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// T h1 = dist_r_sm[r_sm * 2]; -// T h2 = dist_r_sm[r_sm * 2 + 1]; -// T h3 = dist_r_sm[r_sm * 2 + 2]; -// T h4 = dist_r_sm[r_sm * 2 + 3]; -// T r1 = ratio_r_sm[r_sm * 2]; -// T r2 = ratio_r_sm[r_sm * 2 + 1]; -// T r3 = ratio_r_sm[r_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2, c_sm, f_sm)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 1, c_sm, f_sm)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 2, c_sm, f_sm)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 4, c_sm, f_sm)]; - -// // __syncthreads(); -// // if (debug) { -// // printf("r_sm(%d) %f %f %f %f %f\n",r_sm, a,b,c,d,e); -// // printf("r_sm_h(%d) %f %f %f %f\n",r_sm, h1,h2,h3,h4); -// // printf("r_sm_r(%d) %f %f %f %f\n",r_sm, r1,r2,r3,r4); -// // } -// // __syncthreads(); - -// // T tb = a * h1 + b * 2 * (h1+h2) + c * h2; -// // T tc = b * h2 + c * 2 * (h2+h3) + d * h3; -// // T td = c * h3 + d * 2 * (h3+h4) + e * h4; - -// // if (debug) printf("f_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// // if (debug) { -// // printf("f_sm(%d) %f %f %f %f %f f_sm_h %f %f %f %f f_sm_r %f %f %f -// %f, out: %f\n",f_sm, a,b,c,d,e, h1,h2,h3,h4,r1,r2,r3,r4, mass_trans(a, b, -// c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); -// // } - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4); - -// // if (debug) printf("store[%d %d %d] %f (%f)\n", r_gl, c_gl, f_gl, -// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4), -// // mass_trans(a, b, c, (T)0.0, (T)0.0, h1, (T)0.0, -// (T)0.0, -// // h4, r1, r2, (T)0.0, (T)0.0)); -// // // printf("%d %d %d\n", r_gl, c_gl, f_gl); -// // if (blockId * R + R == nr-1) { -// // if (r_gl+1 == nr_c - 1) { -// // if (r_gl+1 == nr_c - 1) { -// // // T te = h4 * d + 2 * h4 * e; -// // // te += td * r3; -// // dw[get_idx(lddw1, lddw2, blockId * R + actual_R, c_gl, f_gl)] = -// // mass_trans(c, d, e, (T)0.0, (T)0.0, -// // h1, h2, (T)0.0, (T)0.0, r1, r2, (T)0.0, (T)0.0); - -// // if (debug) printf("store-last[%d %d %d] %f\n", blockId * R + -// actual_R, -// // c_gl, f_gl, -// // mass_trans(c, d, e, (T)0.0, (T)0.0, -// // h1, h2, (T)0.0, (T)0.0, r1, r2, (T)0.0, (T)0.0)); -// // } -// //} -// } -// } - -// template -// void lpk_reo_3_adaptive_launcher(Handle &handle, SIZE *shape_h, -// SIZE *shape_c_h, SIZE *shape_d, SIZE -// *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM -// processed_n, DIM *processed_dims_h, DIM -// *processed_dims_d, DIM curr_dim_r, DIM -// curr_dim_c, DIM curr_dim_f, T *ddist_r, T -// *dratio_r, T *dv1, LENGTH lddv11, LENGTH -// lddv12, T *dv2, LENGTH lddv21, LENGTH -// lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, -// int queue_idx) { - -// SIZE nr = shape_h[curr_dim_r]; -// SIZE nc = shape_h[curr_dim_c]; -// SIZE nf = shape_h[curr_dim_f]; -// SIZE nr_c = shape_c_h[curr_dim_r]; -// SIZE nc_c = shape_c_h[curr_dim_c]; -// SIZE nf_c = shape_c_h[curr_dim_f]; - -// SIZE total_thread_z = nr_c; -// // if (nr_c % 2 == 1){ total_thread_z = nr_c - 1; } -// // else { total_thread_z = nr_c; } -// SIZE total_thread_y = nc_c; -// SIZE total_thread_x = nf_c; 
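Every store in these kernels boils down to `dw[...] = mass_trans(a, b, c, d, e, h1..h4, r1..r4)`: one row of a 1-D finite-element mass matrix applied to five consecutive values, with the two neighboring rows folded in through the ratio weights. A sketch reconstructed from the commented-out reference formulas (`tb`/`tc`/`td`) above; the real `mass_trans` also receives r2 and r3, and its exact scaling may differ:

```cuda
// Reconstruction of the commented-out reference math behind mass_trans:
// three overlapping mass-matrix rows over the 5-point stencil {a,b,c,d,e},
// with the outer rows folded into the center row by the restriction
// ratios r1 and r4 (the kernels set r4 = 1 - r3).
template <typename T>
__host__ __device__ T mass_trans_sketch(T a, T b, T c, T d, T e,
                                        T h1, T h2, T h3, T h4,
                                        T r1, T r4) {
  T tb = a * h1 / 6 + b * (h1 + h2) / 3 + c * h2 / 6; // row centered at b
  T tc = b * h2 / 6 + c * (h2 + h3) / 3 + d * h3 / 6; // row centered at c
  T td = c * h3 / 6 + d * (h3 + h4) / 3 + e * h4 / 6; // row centered at d
  return tc + tb * r1 + td * r4;                      // fold neighbors in
}
```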
- -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = ((R * 2 + 3) * C * F + (R * 2 + 3) * 2) * sizeof(T); -// sm_size += (D * 4) * sizeof(SIZE); -// sm_size += (D * 1) * sizeof(DIM); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// for (DIM d = 0; d < D; d++) { -// if (d != curr_dim_f && d != curr_dim_c && d != curr_dim_r) { -// SIZE t = shape_h[d]; -// for (DIM k = 0; k < processed_n; k++) { -// if (d == processed_dims_h[k]) { -// t = shape_c_h[d]; -// } -// } -// gridx *= t; -// } -// } -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); -// // printf("lpk_reo_3 exec config (%d %d %d) (%d %d %d)\n", tbx, tby, tbz, -// // gridx, gridy, gridz); - -// _lpk_reo_3<<>>( -// shape_d, shape_c_d, ldvs, ldws, processed_n, processed_dims_d, -// curr_dim_r, curr_dim_c, curr_dim_f, ddist_r, dratio_r, dv1, lddv11, -// lddv12, dv2, lddv21, lddv22, dw, lddw1, lddw2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lpk_reo_3(Handle &handle, SIZE *shape_h, SIZE *shape_c_h, SIZE -// *shape_d, -// SIZE *shape_c_d, SIZE *ldvs, SIZE *ldws, DIM processed_n, -// DIM *processed_dims_h, DIM *processed_dims_d, DIM curr_dim_r, -// DIM curr_dim_c, DIM curr_dim_f, T *ddist_r, T *dratio_r, T -// *dv1, LENGTH lddv11, LENGTH lddv12, T *dv2, LENGTH lddv21, -// LENGTH lddv22, T *dw, LENGTH lddw1, LENGTH lddw2, int -// queue_idx, int config) { - -// #define LPK(R, C, F) \ -// { \ -// lpk_reo_3_adaptive_launcher( \ -// handle, shape_h, shape_c_h, shape_d, shape_c_d, ldvs, ldws, \ -// processed_n, processed_dims_h, processed_dims_d,\ -// curr_dim_r, \ -// curr_dim_c, curr_dim_f, ddist_r, dratio_r, dv1, lddv11, lddv12, dv2, \ -// lddv21, lddv22, dw, lddw1, lddw2, queue_idx); \ -// } -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D >= 3) { -// if (profile || config == 6) { -// LPK(2, 2, 128) -// } -// if (profile || config == 5) { -// LPK(2, 2, 64) -// } -// if (profile || config == 4) { -// LPK(2, 2, 32) -// } -// if (profile || config == 3) { -// LPK(4, 4, 16) -// } -// if (profile || config == 2) { -// LPK(8, 8, 8) -// } -// if (profile || config == 1) { -// LPK(4, 4, 4) -// } -// if (profile || config == 0) { -// LPK(2, 2, 2) -// } -// } else { -// printf("Error: mass_trans_multiply_3_cpt is only for 3D data\n"); -// } - -// #undef LPK -// } - } // namespace mgard_x #endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.h b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.h deleted file mode 100644 index 922eecbaf1..0000000000 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_LINEAR_PROCESSING_KERNEL_3D -#define MGARD_X_LINEAR_PROCESSING_KERNEL_3D - -#include "../../Common.h" - -namespace mgard_x { - -template -void lpk_reo_1_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf, SIZE nf_c, - SIZE zero_r, SIZE zero_c, SIZE zero_f, T *ddist_f, - T *dratio_f, T *dv1, SIZE lddv11, SIZE lddv12, T *dv2, - SIZE lddv21, SIZE lddv22, T *dw, SIZE lddw1, SIZE lddw2, - int queue_idx, int config); - -template -void lpk_reo_2_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf_c, SIZE nc_c, - T *ddist_c, T *dratio_c, T *dv1, SIZE lddv11, SIZE lddv12, - T *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE lddw1, - SIZE lddw2, int queue_idx, int config); - -template -void lpk_reo_3_3d(Handle &handle, SIZE nr, SIZE nc_c, SIZE nf_c, - SIZE nr_c, T *ddist_r, T *dratio_r, T *dv1, SIZE lddv11, - SIZE lddv12, T *dv2, SIZE lddv21, SIZE lddv22, T *dw, - SIZE lddw1, SIZE lddw2, int queue_idx, int config); - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.hpp b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.hpp index 0b043fb729..e757c85457 100644 --- a/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.hpp +++ b/include/mgard-x/DataRefactoring/MultiDimension/Correction/LinearProcessingKernel3D.hpp @@ -435,14 +435,11 @@ class Lpk1Reo3D : public AutoTuner { SubArray dv1, SubArray dv2, SubArray dw, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr1[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk1_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -455,22 +452,26 @@ class Lpk1Reo3D : public AutoTuner { GenTask(nr, nc, nf, nf_c, zero_r, zero_c, zero_f, ddist_f, \ dratio_f, dv1, dv2, dw, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk1Reo3D.\n"; + exit(-1); + } #undef LPK if (AutoTuner::ProfileKernels) { @@ -787,14 +788,11 @@ class Lpk2Reo3D : public AutoTuner { SubArray dv1, SubArray dv2, SubArray dw, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf_c) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr2[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk2_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + 
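The retuned Lpk1Reo3D dispatch above (and its Lpk2Reo3D/Lpk3Reo3D twins below) replaces the fixed `LPK(0..6)` sequence with a fallback cascade: probe from the largest thread-block shape (config 6) downward, step `config` down whenever `Execute` reports failure, and error out only when config 0 has failed too; under profiling, the fastest successful configuration is kept. A plain-C++ sketch of that loop, with a hypothetical `execute` callable standing in for the `LPK(CONFIG)` macro body:

```cpp
#include <cstdio>
#include <cstdlib>
#include <functional>
#include <limits>

struct ExecutionReturnSketch {
  bool success;          // false when a config cannot run (e.g. resources)
  double execution_time; // filled in when kernels are profiled
};

// Sketch of the fallback logic the diff adds around LPK(CONFIG): try the
// requested config, fall through to smaller thread-block shapes on failure,
// and abort only when config 0 has failed as well.
int run_with_fallback(std::function<ExecutionReturnSketch(int)> execute,
                      int config, bool profile) {
  double min_time = std::numeric_limits<double>::max();
  int min_config = 0;
  ExecutionReturnSketch ret{false, 0.0};
  for (int c = 6; c >= 0; c--) {
    if (config == c || profile) {
      ret = execute(c);
      if (profile && ret.success && ret.execution_time < min_time) {
        min_time = ret.execution_time; // remember the fastest success
        min_config = c;
      }
      if (!ret.success)
        config--; // fall through to the next smaller configuration
    }
  }
  if (config < 0 && !ret.success) {
    std::fprintf(stderr, "no suitable config\n");
    std::exit(-1);
  }
  return profile ? min_config : config;
}
```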
ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -806,22 +804,26 @@ class Lpk2Reo3D : public AutoTuner { TaskType task = GenTask(nr, nc, nf_c, nc_c, ddist_c, dratio_c, \ dv1, dv2, dw, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk2Reo3D.\n"; + exit(-1); + } #undef LPK if (AutoTuner::ProfileKernels) { @@ -1156,14 +1158,11 @@ class Lpk3Reo3D : public AutoTuner { SubArray dv1, SubArray dv2, SubArray dw, int queue_idx) { int range_l = std::min(6, (int)std::log2(nf_c) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); - // int config = - // AutoTuner::autoTuningTable.auto_tuning_mr3[arch][prec][range_l]; int config = AutoTuner::autoTuningTable.lpk3_3d[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -1175,22 +1174,26 @@ class Lpk3Reo3D : public AutoTuner { TaskType task = GenTask(nr, nc_c, nf_c, nr_c, ddist_r, dratio_r, \ dv1, dv2, dw, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LPK(0) - LPK(1) - LPK(2) - LPK(3) - LPK(4) - LPK(5) - LPK(6) + LPK(6) if (!ret.success) config--; + LPK(5) if (!ret.success) config--; + LPK(4) if (!ret.success) config--; + LPK(3) if (!ret.success) config--; + LPK(2) if (!ret.success) config--; + LPK(1) if (!ret.success) config--; + LPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for Lpk3Reo3D.\n"; + exit(-1); + } #undef LPK if (AutoTuner::ProfileKernels) { @@ -1199,1130 +1202,6 @@ class Lpk3Reo3D : public AutoTuner { } }; -// template -// __global__ void _lpk_reo_1_3d(SIZE nr, SIZE nc, SIZE nf, SIZE nf_c, SIZE -// zero_r, -// SIZE zero_c, SIZE zero_f, T *ddist_f, T -// *dratio_f, T *dv1, SIZE lddv11, SIZE lddv12, T -// *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE -// lddw1, SIZE lddw2) { - -// // bool debug = false; -// // if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 1 && -// // threadIdx.y == 0 && threadIdx.z == 0 ) debug = false; - -// // bool debug2 = false; -// // if (blockIdx.z == gridDim.z-1 && blockIdx.y == 1 && blockIdx.x == 16) -// // debug2 = false; - -// bool PADDING = (nf % 2 == 0); - -// T *sm = SharedMemory(); -// // extern __shared__ double sm[]; // size: (blockDim.x + 1) * (blockDim.y + -// 1) -// // * (blockDim.z + 1) -// SIZE ldsm1 = F * 2 + 3; -// SIZE ldsm2 = C; -// T *v_sm = sm; -// T *dist_f_sm = sm + ldsm1 * ldsm2 * R; -// T *ratio_f_sm = dist_f_sm + ldsm1; - -// // bool debug = false; -// // 
if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 0 && -// // threadIdx.z == 0 && threadIdx.y == 0 ) debug = true; - -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockIdx.x * blockDim.x + threadIdx.x; - -// SIZE blockId = blockIdx.x; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_F = F; -// if (nf_c - blockId * blockDim.x < F) { -// actual_F = nf_c - blockId * blockDim.x; -// } - -// // if (nf_c % 2 == 1){ -// // if(nf_c-1 - blockId * blockDim.x < F) { actual_F = nf_c - 1 - blockId -// * -// // blockDim.x; } -// // } else { -// // if(nf_c - blockId * blockDim.x < F) { actual_F = nf_c - blockId * -// // blockDim.x; } -// // } - -// // if (debug) printf("actual_F %d\n", actual_F); - -// if (r_gl < nr && c_gl < nc && f_gl < nf_c) { -// if (r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // if (debug) printf("load left vsm[%d]: 0.0\n", f_sm * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 2)] = 0.0; -// } else { -// // if (debug) printf("load left vsm[%d]<-dv1[%d, %d, %d]: %f\n", f_sm * -// 2 -// // + 2, r_gl, c_gl, f_gl, dv1[get_idx(lddv11, lddv12, r_gl, c_gl, -// f_gl)]); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 2)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; -// } - -// if (f_sm == actual_F - 1) { -// if (r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // if (debug) printf("load left+1 vsm[%d]: 0.0\n", actual_F * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = 0.0; -// } else { -// if (f_gl + 1 < nf_c) { -// // if (debug) printf("load left+1 vsm[%d]: %f\n", actual_F * 2 + 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl + 1)]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl + 1)]; -// } else { -// // if (debug) printf("load left+1 vsm[%d]: 0.0\n", actual_F * 2 + -// 2); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 2)] = -// 0.0; -// } -// } -// } - -// if (f_sm == 0) { -// // left -// if (r_gl < zero_r && c_gl < zero_c && f_gl < zero_f) { -// // coarse (-1) -// // if (debug) printf("load left-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = 0.0; -// } else { -// if (f_gl >= 1) { -// // other (-1) -// // if (debug) printf("load left-1 vsm[0]: %f\n", -// dv1[get_idx(lddv11, -// // lddv12, r_gl, c_gl, f_gl-1)]); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl - 1)]; -// } else { -// // other (-1) -// // if (debug) printf("load left-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 0)] = 0.0; -// } -// } -// } - -// // right -// if (!PADDING) { -// if (nf_c % 2 != 0) { -// if (f_gl >= 1 && f_gl < nf_c ) { -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)], r_gl, -// // c_gl, f_gl - 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl < nf_c - 1) { -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 3, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)], r_gl, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = -// 
dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = 0.0; -// } -// } -// } else { // PADDING -// if (nf_c % 2 != 0) { -// if (f_gl >= 1 && f_gl < nf_c - 1) { -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)], r_gl, -// // c_gl, f_gl - 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - 1)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl < nf_c - 2) { -// // if (debug) printf("load right vsm[%d]: %f <- %d %d %d\n", f_sm * -// 2 -// // + 3, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)], r_gl, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right vsm[%d]: 0\n", f_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)] = 0.0; -// } -// } -// } - -// if (f_sm == actual_F - 1) { -// // right (+1) -// if (!PADDING) { -// if (nf_c % 2 != 0) { -// if (f_gl < nf_c - 1) { -// // if (debug) printf("load right+1 vsm[%d]: %f <- %d %d %d\n", -// // actual_F * 2 + 1, dv2[get_idx(lddv21, lddv22, r_gl, c_gl, -// f_gl)], -// // r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right+1 vsm[%d]: 0.0\n", actual_F * 2 -// + -// // 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl >= actual_F) { -// // if (debug) printf("load right-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)], -// r_gl, -// // c_gl, f_gl - actual_F); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)]; -// } else { -// // if (debug) printf("load right-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = 0.0; -// } -// } -// } else { -// if (nf_c % 2 != 0) { -// if (f_gl < nf_c - 2) { -// // if (debug) printf("actual_F(%d), load right+1 vsm[%d]: %f <- -// %d -// // %d %d\n", actual_F, actual_F * 2 + 1, dv2[get_idx(lddv21, -// lddv22, -// // r_gl, c_gl, f_gl)], r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load right+1 vsm[%d]: 0.0\n", actual_F * 2 -// + -// // 1); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, actual_F * 2 + 1)] = 0.0; -// } -// } else { // nf_c % 2 == 0 -// if (f_gl >= actual_F && f_gl - actual_F < nf_c - 2) { -// // if (debug) printf("load right-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)], -// r_gl, -// // c_gl, f_gl - actual_F); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl - actual_F)]; -// } else { -// // if (debug) printf("load right-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, 1)] = 0.0; -// } -// } -// } -// } -// } - -// bool debug = false; -// // if (r_gl == 0 && c_gl == 0) debug = true; - -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_F) { -// if (blockId * F * 2 + f_sm < nf) { -// dist_f_sm[2 
+ f_sm] = ddist_f[blockId * F * 2 + f_sm]; -// ratio_f_sm[2 + f_sm] = dratio_f[blockId * F * 2 + f_sm]; -// if (debug) printf("load dist[%d] -> sm[%d]: %f\n", blockId * F * 2 + -// f_sm, 2 + f_sm, ddist_f[blockId * F * 2 + f_sm]); -// } else { -// dist_f_sm[2 + f_sm] = 0.0; -// ratio_f_sm[2 + f_sm] = 0.0; -// } - -// if (blockId * F * 2 + actual_F + f_sm < nf) { -// dist_f_sm[2 + actual_F + f_sm] = -// ddist_f[blockId * F * 2 + actual_F + f_sm]; -// ratio_f_sm[2 + actual_F + f_sm] = -// dratio_f[blockId * F * 2 + actual_F + f_sm]; -// if (debug) printf("load dist[%d] -> sm[%d]: %f\n", blockId * F * 2 + -// actual_F + f_sm, 2 + actual_F + f_sm, ddist_f[blockId * F * 2 + -// actual_F + f_sm]); -// } else { -// dist_f_sm[2 + actual_F + f_sm] = 0.0; -// ratio_f_sm[2 + actual_F + f_sm] = 0.0; -// } -// // dist_f_sm[2 + f_sm] = ddist_f[f_gl]; -// // dist_f_sm[2 + actual_F + f_sm] = ddist_f[actual_F + f_gl]; -// // ratio_f_sm[2 + f_sm] = dratio_f[f_gl]; -// // ratio_f_sm[2 + actual_F + f_sm] = dratio_f[actual_F + f_gl]; -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// // dist_f_sm[f_sm] = ddist_f[f_gl - 2]; -// // ratio_f_sm[f_sm] = dratio_f[f_gl - 2]; -// dist_f_sm[f_sm] = ddist_f[blockId * F * 2 + f_sm - 2]; -// ratio_f_sm[f_sm] = dratio_f[blockId * F * 2 + f_sm - 2]; -// } -// } else { -// if (f_sm < 2) { -// dist_f_sm[f_sm] = 0.0; -// ratio_f_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// if (r_gl < nr && c_gl < nc && f_gl < nf_c) { -// T h1 = dist_f_sm[f_sm * 2]; -// T h2 = dist_f_sm[f_sm * 2 + 1]; -// T h3 = dist_f_sm[f_sm * 2 + 2]; -// T h4 = dist_f_sm[f_sm * 2 + 3]; -// T r1 = ratio_f_sm[f_sm * 2]; -// T r2 = ratio_f_sm[f_sm * 2 + 1]; -// T r3 = ratio_f_sm[f_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 1)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 2)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 3)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm, f_sm * 2 + 4)]; - -// // if (f_gl == nf_c - 1) { -// // printf("f_sm(%d) %f %f %f %f %f\n",f_sm, a,b,c,d,e); -// // printf("f_sm_h(%d) %f %f %f %f\n",f_sm, h1,h2,h3,h4); -// // printf("f_sm_r(%d) %f %f %f %f\n",f_sm, r1,r2,r3,r4); -// // } - -// // T tb = a * h1 + b * 2 * (h1+h2) + c * h2; -// // T tc = b * h2 + c * 2 * (h2+h3) + d * h3; -// // T td = c * h3 + d * 2 * (h3+h4) + e * h4; - -// // if (debug) printf("f_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4); - -// // if (debug) printf("store[%d %d %d] %f \n", r_gl, c_gl, f_gl, -// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); - -// // printf("test block %d F %d nf %d\n", blockId, F, nf); -// // if (f_gl+1 == nf_c-1) { - -// // // T te = h4 * d + 2 * h4 * e; -// // //printf("f_sm(%d) mm-e: %f\n", f_sm, te); -// // // te += td * r3; -// // dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl+1)] = -// // mass_trans(c, d, e, (T)0.0, (T)0.0, h1, h2, (T)0.0, (T)0.0, r1, -// r2, -// // (T)0.0, (T)0.0); -// // } -// } -// } - -// template -// void lpk_reo_1_3d_adaptive_launcher(Handle &handle, SIZE nr, SIZE nc, -// SIZE nf, SIZE nf_c, SIZE zero_r, SIZE -// zero_c, SIZE zero_f, T *ddist_f, T -// *dratio_f, T *dv1, SIZE lddv11, SIZE -// lddv12, T *dv2, SIZE lddv21, SIZE lddv22, -// T *dw, SIZE lddw1, SIZE lddw2, int -// queue_idx) { -// // 
printf("dratio_f: "); -// // print_matrix_cuda(1, (nf-1)*2, dratio_f, (nf-1)*2); -// SIZE total_thread_z = nr; -// SIZE total_thread_y = nc; -// SIZE total_thread_x = nf_c; -// // if (nf_c % 2 == 1) { total_thread_x = nf_c - 1; } -// // else { total_thread_x = nf; } -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = (R * C * (F * 2 + 3) + (F * 2 + 3) * 2) * sizeof(T); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); - -// // printf("nr: %d nc: %d, nf: %d, nf_c: %d\n", nr, nc, nf, nf_c); -// // printf("tb: %d %d %d, grid: %d %d %d\n", tbx, tby, tbz, gridx, gridy, -// // gridz); - -// _lpk_reo_1_3d<<>>( -// nr, nc, nf, nf_c, zero_r, zero_c, zero_f, ddist_f, dratio_f, dv1, -// lddv11, lddv12, dv2, lddv21, lddv22, dw, lddw1, lddw2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lpk_reo_1_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf, SIZE nf_c, -// SIZE zero_r, SIZE zero_c, SIZE zero_f, T *ddist_f, T -// *dratio_f, T *dv1, SIZE lddv11, SIZE lddv12, T *dv2, SIZE -// lddv21, SIZE lddv22, T *dw, SIZE lddw1, SIZE lddw2, int -// queue_idx, int config) { - -// #define LPK(R, C, F) \ -// { \ -// lpk_reo_1_3d_adaptive_launcher( \ -// handle, nr, nc, nf, nf_c, zero_r, zero_c, zero_f, ddist_f, dratio_f, \ -// dv1, lddv11, lddv12, dv2, lddv21, lddv22, dw, lddw1, lddw2, \ -// queue_idx); \ -// } - -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// LPK(2, 2, 128) -// } -// if (profile || config == 5) { -// LPK(2, 2, 64) -// } -// if (profile || config == 4) { -// LPK(2, 2, 32) -// } -// if (profile || config == 3) { -// LPK(4, 4, 16) -// } -// if (profile || config == 2) { -// LPK(8, 8, 8) -// } -// if (profile || config == 1) { -// LPK(4, 4, 4) -// } -// if (profile || config == 0) { -// LPK(2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// LPK(1, 2, 128) -// } -// if (profile || config == 5) { -// LPK(1, 2, 64) -// } -// if (profile || config == 4) { -// LPK(1, 2, 32) -// } -// if (profile || config == 3) { -// LPK(1, 4, 16) -// } -// if (profile || config == 2) { -// LPK(1, 8, 8) -// } -// if (profile || config == 1) { -// LPK(1, 4, 4) -// } -// if (profile || config == 0) { -// LPK(1, 2, 4) -// } -// } else if (D == 1) { -// if (profile || config == 6) { -// LPK(1, 1, 128) -// } -// if (profile || config == 5) { -// LPK(1, 1, 64) -// } -// if (profile || config == 4) { -// LPK(1, 1, 32) -// } -// if (profile || config == 3) { -// LPK(1, 1, 16) -// } -// if (profile || config == 2) { -// LPK(1, 1, 8) -// } -// if (profile || config == 1) { -// LPK(1, 1, 8) -// } -// if (profile || config == 0) { -// LPK(1, 1, 8) -// } -// } -// #undef LPK -// } - -// template -// __global__ void _lpk_reo_2_3d(SIZE nr, SIZE nc, SIZE nf_c, SIZE nc_c, T -// *ddist_c, -// T *dratio_c, T *dv1, SIZE lddv11, SIZE lddv12, -// T *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE -// lddw1, SIZE lddw2) { - -// // bool debug = false; -// // if (blockIdx.y == gridDim.y-1 && blockIdx.x == 0 && -// // threadIdx.x == 0 ) debug = false; - -// // bool debug2 = false; -// // if (blockIdx.z == gridDim.z-1 && blockIdx.y == 1 && blockIdx.x 
== 16) -// // debug2 = false; - -// bool PADDING = (nc % 2 == 0); - -// T *sm = SharedMemory(); - -// // extern __shared__ double sm[]; // size: (blockDim.x + 1) * (blockDim.y + -// 1) -// // * (blockDim.z + 1) -// SIZE ldsm1 = F; -// SIZE ldsm2 = C * 2 + 3; -// T *v_sm = sm; -// T *dist_c_sm = sm + ldsm1 * ldsm2 * R; -// T *ratio_c_sm = dist_c_sm + ldsm2; - -// // bool debug = false; -// // if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 0 && -// // threadIdx.z == 0 && threadIdx.x == 0 ) debug = false; - -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockIdx.x * blockDim.x + threadIdx.x; - -// SIZE blockId = blockIdx.y; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_C = C; -// if (nc_c - blockIdx.y * blockDim.y < C) { -// actual_C = nc_c - blockIdx.y * blockDim.y; -// } - -// // if (nc_c % 2 == 1){ -// // if(nc_c-1 - blockIdx.y * blockDim.y < C) { actual_C = nc_c - 1 - -// // blockIdx.y * blockDim.y; } -// // } else { -// // if(nc_c - blockIdx.y * blockDim.y < C) { actual_C = nc_c - blockIdx.y -// * -// // blockDim.y; } -// // } - -// // if (debug) printf("actual_C %d\n", actual_C); - -// if (r_gl < nr && c_gl < nc_c && f_gl < nf_c) { -// // if (debug) printf("load up vsm[%d]: %f <- %d %d %d\n", c_sm * 2 + 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 2, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; - -// if (c_sm == actual_C - 1) { -// if (c_gl + 1 < nc_c) { -// // if (debug) printf("load up+1 vsm[%d]: %f <- %d %d %d\n", actual_C -// * 2 -// // + 2, dv1[get_idx(lddv11, lddv12, r_gl, blockId * C + actual_C, -// // f_gl)], r_gl, blockId * C + actual_C, f_gl); -// // c_gl+1 == blockId * C + C -// v_sm[get_idx(ldsm1, ldsm2, r_sm, actual_C * 2 + 2, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl + 1, f_gl)]; -// } else { -// // if (debug) printf("load up+1 vsm[%d]: 0.0\n", actual_C * 2 + 2); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, actual_C * 2 + 2, f_sm)] = 0.0; -// } -// } - -// if (c_sm == 0) { -// if (c_gl >= 1) { -// // if (debug) printf("load up-1 vsm[0]: %f <- %d %d %d\n", -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl-1, f_gl)], r_gl, c_gl-1, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl - 1, f_gl)]; -// } else { -// // if (debug) printf("load up-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 0, f_sm)] = 0.0; -// } -// } - -// if (!PADDING) { -// if (c_gl < nc_c - 1) { -// // if (debug) printf("load down vsm[%d]: %f <- %d %d %d\n", c_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load down vsm[%d]: 0.0\n", c_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = 0.0; -// } -// } else { -// if (c_gl < nc_c - 2) { -// // if (debug) printf("load down vsm[%d]: %f <- %d %d %d\n", c_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load down vsm[%d]: 0.0\n", c_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)] = 0.0; -// } -// } - -// if (c_gl >= 1 && -// (PADDING && c_gl 
- 1 < nc_c - 2 || !PADDING && c_gl - 1 < nc_c - 1)) -// { -// if (c_sm == 0) { -// // if (debug) printf("PADDING: %d, c_gl-1: %d nc_c-2: %d\n", PADDING, -// // c_gl-1, nc_c - 2); if (debug) printf("load down-1 vsm[1]: %f <- %d -// %d -// // %d\n", dv2[get_idx(lddv11, lddv12, r_gl, c_gl-1, f_gl)], r_gl, -// // c_gl-1, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 1, f_sm)] = -// dv2[get_idx(lddv11, lddv12, r_gl, c_gl - 1, f_gl)]; -// } -// } else { -// if (c_sm == 0) { -// // if (debug) printf("load down-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, r_sm, 1, f_sm)] = 0.0; -// } -// } - -// } - -// // load dist/ratio using f_sm for better performance -// // assumption F >= C -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_C) { -// if (blockId * C * 2 + f_sm < nc) { -// dist_c_sm[2 + f_sm] = ddist_c[blockId * C * 2 + f_sm]; -// ratio_c_sm[2 + f_sm] = dratio_c[blockId * C * 2 + f_sm]; -// } else { -// dist_c_sm[2 + f_sm] = 0.0; -// ratio_c_sm[2 + f_sm] = 0.0; -// } - -// if (blockId * C * 2 + actual_C + f_sm < nc) { -// dist_c_sm[2 + actual_C + f_sm] = -// ddist_c[blockId * C * 2 + actual_C + f_sm]; -// ratio_c_sm[2 + actual_C + f_sm] = -// dratio_c[blockId * C * 2 + actual_C + f_sm]; -// } else { -// dist_c_sm[2 + actual_C + f_sm] = 0.0; -// ratio_c_sm[2 + actual_C + f_sm] = 0.0; -// } -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// dist_c_sm[f_sm] = ddist_c[blockId * C * 2 - 2 + f_sm]; -// ratio_c_sm[f_sm] = dratio_c[blockId * C * 2 - 2 + f_sm]; -// } -// } else { -// if (f_sm < 2) { -// dist_c_sm[f_sm] = 0.0; -// ratio_c_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// if (r_gl < nr && c_gl < nc_c && f_gl < nf_c) { -// T h1 = dist_c_sm[c_sm * 2]; -// T h2 = dist_c_sm[c_sm * 2 + 1]; -// T h3 = dist_c_sm[c_sm * 2 + 2]; -// T h4 = dist_c_sm[c_sm * 2 + 3]; -// T r1 = ratio_c_sm[c_sm * 2]; -// T r2 = ratio_c_sm[c_sm * 2 + 1]; -// T r3 = ratio_c_sm[c_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2, f_sm)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 1, f_sm)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 2, f_sm)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 3, f_sm)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm, c_sm * 2 + 4, f_sm)]; - -// // if (debug) { -// // printf("c_sm(%d) %f %f %f %f %f\n",c_sm, a,b,c,d,e); -// // printf("c_sm_h(%d) %f %f %f %f\n",c_sm, h1,h2,h3,h4); -// // printf("c_sm_r(%d) %f %f %f %f\n",c_sm, r1,r2,r3,r4); -// // } - -// // T tb = a * h1 + b * 2 * (h1+h2) + c * h2; -// // T tc = b * h2 + c * 2 * (h2+h3) + d * h3; -// // T td = c * h3 + d * 2 * (h3+h4) + e * h4; - -// // if (debug) printf("c_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// // if (r_gl == 0 && f_gl == 0 && r_sm == 0 && f_sm == 0) { -// // printf("mr2(%d) mm2: %f -> (%d %d %d)\n", c_sm, tc, r_gl, c_gl, -// f_gl); -// // // printf("f_sm(%d) b c d: %f %f %f\n", f_sm, tb, tc, td); -// // } - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4); - -// // if (debug) printf("store[%d %d %d] %f \n", r_gl, c_gl, f_gl, -// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4)); - -// // printf("%d %d %d\n", r_gl, c_gl, f_gl); -// // if (blockId * C + C == nc-1) { -// // if (c_gl + 1 == nc_c - 1) { -// // // T te = h4 * d + 2 * h4 * e; -// // // te += td * r3; -// // dw[get_idx(lddw1, lddw2, r_gl, blockId * C + actual_C, f_gl)] = -// // mass_trans(c, d, e, (T)0.0, (T)0.0, -// // h1, h2, (T)0.0, 
(T)0.0, r1, r2, (T)0.0, (T)0.0); -// // } -// // } - -// } - -// } - -// template -// void lpk_reo_2_3d_adaptive_launcher(Handle &handle, SIZE nr, SIZE nc, -// SIZE nf_c, SIZE nc_c, T *ddist_c, T -// *dratio_c, T *dv1, SIZE lddv11, SIZE -// lddv12, T *dv2, SIZE lddv21, SIZE lddv22, -// T *dw, SIZE lddw1, SIZE lddw2, int -// queue_idx) { -// cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte); -// cudaDeviceSetCacheConfig(cudaFuncCachePreferShared); -// SIZE total_thread_z = nr; -// SIZE total_thread_y = nc_c; -// SIZE total_thread_x = nf_c; -// SIZE tbx, tby, tbz, gridx, gridy, gridz; -// dim3 threadsPerBlock, blockPerGrid; -// size_t sm_size; - -// tbz = R; -// tby = C; -// tbx = F; -// sm_size = (R * (C * 2 + 3) * F + (C * 2 + 3) * 2) * sizeof(T); -// gridz = ceil((float)total_thread_z / tbz); -// gridy = ceil((float)total_thread_y / tby); -// gridx = ceil((float)total_thread_x / tbx); -// threadsPerBlock = dim3(tbx, tby, tbz); -// blockPerGrid = dim3(gridx, gridy, gridz); - -// // printf("nr: %d nc: %d, nf_c: %d, nc_c: %d\n", nr, nc, nf_c, nc_c); -// // printf("tb: %d %d %d, grid: %d %d %d\n", tbx, tby, tbz, gridx, gridy, -// // gridz); - -// _lpk_reo_2_3d<<>>( -// nr, nc, nf_c, nc_c, ddist_c, dratio_c, dv1, lddv11, lddv12, dv2, -// lddv21, lddv22, dw, lddw1, lddw2); -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void lpk_reo_2_3d(Handle &handle, SIZE nr, SIZE nc, SIZE nf_c, SIZE -// nc_c, -// T *ddist_c, T *dratio_c, T *dv1, SIZE lddv11, SIZE lddv12, -// T *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE lddw1, SIZE -// lddw2, int queue_idx, int config) { - -// #define LPK(R, C, F) \ -// { \ -// lpk_reo_2_3d_adaptive_launcher( \ -// handle, nr, nc, nf_c, nc_c, ddist_c, dratio_c, dv1, lddv11, lddv12, \ -// dv2, lddv21, lddv22, dw, lddw1, lddw2, queue_idx); \ -// } - -// bool profile = false; -// if (handle.profile_kernels) { -// profile = true; -// } -// if (D == 3) { -// if (profile || config == 6) { -// LPK(2, 2, 128) -// } -// if (profile || config == 5) { -// LPK(2, 2, 64) -// } -// if (profile || config == 4) { -// LPK(2, 2, 32) -// } -// if (profile || config == 3) { -// LPK(4, 4, 16) -// } -// if (profile || config == 2) { -// LPK(8, 8, 8) -// } -// if (profile || config == 1) { -// LPK(4, 4, 4) -// } -// if (profile || config == 0) { -// LPK(2, 2, 2) -// } -// } else if (D == 2) { -// if (profile || config == 6) { -// LPK(1, 2, 128) -// } -// if (profile || config == 5) { -// LPK(1, 2, 64) -// } -// if (profile || config == 4) { -// LPK(1, 2, 32) -// } -// if (profile || config == 3) { -// LPK(1, 4, 16) -// } -// if (profile || config == 2) { -// LPK(1, 8, 8) -// } -// if (profile || config == 1) { -// LPK(1, 4, 4) -// } -// if (profile || config == 0) { -// LPK(1, 2, 4) -// } -// } else { -// printf("Error: mass_trans_multiply_2_cpt is only for 3D and 2D data\n"); -// } -// #undef LPK -// } - -// template -// __global__ void _lpk_reo_3_3d(SIZE nr, SIZE nc_c, SIZE nf_c, SIZE nr_c, T -// *ddist_r, -// T *dratio_r, T *dv1, SIZE lddv11, SIZE lddv12, -// T *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE -// lddw1, SIZE lddw2) { - -// // bool debug = false; -// // if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 0 && -// // threadIdx.y == 0 && threadIdx.x == 0 ) debug = true; - -// // bool debug2 = false; -// // if (blockIdx.z == 0 && blockIdx.y == 0 && blockIdx.x == 0) -// // debug2 = true; - -// bool PADDING = (nr % 2 == 0); -// T *sm = SharedMemory(); -// SIZE ldsm1 = F; 
-// SIZE ldsm2 = C; -// T *v_sm = sm; -// T *dist_r_sm = sm + ldsm1 * ldsm2 * (R * 2 + 3); -// T *ratio_r_sm = dist_r_sm + (R * 2 + 3); - -// SIZE r_gl = blockIdx.z * blockDim.z + threadIdx.z; -// SIZE c_gl = blockIdx.y * blockDim.y + threadIdx.y; -// SIZE f_gl = blockIdx.x * blockDim.x + threadIdx.x; - -// // if (debug) printf("debugging gl: %d %d %d\n", r_gl, c_gl, f_gl); - -// SIZE blockId = blockIdx.z; - -// SIZE r_sm = threadIdx.z; -// SIZE c_sm = threadIdx.y; -// SIZE f_sm = threadIdx.x; - -// SIZE actual_R = R; -// if (nr_c - blockIdx.z * blockDim.z < R) { -// actual_R = nr_c - blockIdx.z * blockDim.z; -// } -// // if (nr_c % 2 == 1){ -// // if(nr_c-1 - blockIdx.z * blockDim.z < R) { actual_R = nr_c - 1 - -// // blockIdx.z * blockDim.z; } -// // } else { -// // if(nr_c - blockIdx.z * blockDim.z < R) { actual_R = nr_c - blockIdx.z -// * -// // blockDim.z; } -// // } - -// // if (debug) printf("actual_R %d\n", actual_R); - -// // if (debug) printf("RCF: %d %d %d\n", R, C, F); -// if (r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// // if (debug) printf("load front vsm[%d]: %f <- %d %d %d\n", r_sm * 2 + -// 2, -// // dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 2, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)]; - -// if (r_sm == actual_R - 1) { -// if (r_gl + 1 < nr_c) { -// // if (debug) printf("load front+1 vsm[%d]: %f <- %d %d %d\n", -// actual_R -// // * 2 + 2, dv1[get_idx(lddv11, lddv12, blockId * R + actual_R, c_gl, -// // f_gl)], blockId * R + actual_R, c_gl, f_gl); -// v_sm[get_idx(ldsm1, ldsm2, actual_R * 2 + 2, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl + 1, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load front+1 vsm[%d]: 0.0\n", actual_R * 2 + -// 2); v_sm[get_idx(ldsm1, ldsm2, actual_R * 2 + 2, c_sm, f_sm)] = 0.0; -// } -// } - -// if (r_sm == 0) { -// if (r_gl >= 1) { -// // if (debug) printf("load front-1 vsm[0]: %f <- %d %d %d\n", -// // dv1[get_idx(lddv11, lddv12, r_gl-1, c_gl, f_gl)], r_gl-1, c_gl, -// // f_gl); -// v_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = -// dv1[get_idx(lddv11, lddv12, r_gl - 1, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load front-1 vsm[0]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, 0, c_sm, f_sm)] = 0.0; -// } -// } - -// if (!PADDING) { -// if (r_gl < nr_c - 1) { -// // if (debug) printf("load back vsm[%d]: %f <- %d %d %d\n", r_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load back vsm[%d]: 0.0\n", r_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = 0.0; -// } -// } else { -// if (r_gl < nr_c - 2) { -// // if (debug) printf("load back vsm[%d]: %f <- %d %d %d\n", r_sm * 2 -// + -// // 3, dv2[get_idx(lddv11, lddv12, r_gl, c_gl, f_gl)], r_gl, c_gl, -// f_gl); v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = -// dv2[get_idx(lddv21, lddv22, r_gl, c_gl, f_gl)]; -// } else { -// // if (debug) printf("load back vsm[%d]: 0.0\n", r_sm * 2 + 3); -// v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)] = 0.0; -// } -// } - -// if (r_gl >= 1 && -// (PADDING && r_gl - 1 < nr_c - 2 || !PADDING && r_gl < nr_c )) { -// // if (blockId > 0) { -// if (r_sm == 0) { -// // if (debug) printf("load back-1 vsm[1]: %f <- %d %d %d\n", -// // dv2[get_idx(lddv11, lddv12, r_gl-1, c_gl, f_gl)], r_gl-1, c_gl, -// // f_gl); -// 
v_sm[get_idx(ldsm1, ldsm2, 1, c_sm, f_sm)] = -// dv2[get_idx(lddv11, lddv12, r_gl - 1, c_gl, f_gl)]; -// } -// } else { -// if (r_sm == 0) { -// // if (debug) printf("load back-1 vsm[1]: 0.0\n"); -// v_sm[get_idx(ldsm1, ldsm2, 1, c_sm, f_sm)] = 0.0; -// } -// } -// } - -// // load dist/ratio using f_sm for better performance -// // assumption F >= R -// // if (debug2) printf("actual_R: %u\n", actual_R); -// if (r_sm == 0 && c_sm == 0 && f_sm < actual_R) { -// // if (debug2) printf(" RCF (%u %u %u)blockid(%u) fsm(%u) nr(%u)\n", R, -// C, F, blockId, blockId * R * 2 + f_sm, nr); if (blockId * R * 2 + f_sm < -// nr) { - -// dist_r_sm[2 + f_sm] = ddist_r[blockId * R * 2 + f_sm]; -// // if (debug2 ) printf("load dist 1 [%d]: %f [%d]\n", 2 + f_sm, -// // dist_r_sm[2 + f_sm], blockId * R * 2 + f_sm); -// ratio_r_sm[2 + f_sm] = dratio_r[blockId * R * 2 + f_sm]; -// // if (debug2 )printf("load ratio 1 [%d]: %f [%d]\n", 2 + f_sm, -// // ratio_r_sm[2 + f_sm], blockId * R * 2 + f_sm); -// } else { -// dist_r_sm[2 + f_sm] = 0.0; -// ratio_r_sm[2 + f_sm] = 0.0; -// } -// if (blockId * R * 2 + actual_R + f_sm < nr) { -// dist_r_sm[2 + actual_R + f_sm] = -// ddist_r[blockId * R * 2 + actual_R + f_sm]; -// // if (debug2 )printf("load dist 2 [%d]: %f [%d]\n", 2 + actual_R + -// f_sm, -// // dist_r_sm[2 + actual_R + f_sm], blockId * R * 2 + actual_R + f_sm); -// ratio_r_sm[2 + actual_R + f_sm] = -// dratio_r[blockId * R * 2 + actual_R + f_sm]; -// // if (debug2 )printf("load ratio 2 [%d]: %f [%d]\n", 2 + actual_R + -// f_sm, -// // ratio_r_sm[2 + actual_R + f_sm], blockId * R * 2 + actual_R + f_sm); -// } else { -// dist_r_sm[2 + actual_R + f_sm] = 0.0; -// ratio_r_sm[2 + actual_R + f_sm] = 0.0; -// } -// } - -// if (blockId > 0) { -// if (f_sm < 2) { -// dist_r_sm[f_sm] = ddist_r[blockId * R * 2 - 2 + f_sm]; -// // if (debug2 )printf("load dist -1 [%d]: %f [%d]\n", f_sm, -// // dist_r_sm[f_sm], blockId * R * 2 - 2 + f_sm); -// ratio_r_sm[f_sm] = dratio_r[blockId * R * 2 - 2 + f_sm]; -// // if (debug2 )printf("load ratio -1 [%d]: %f [%d]\n", f_sm, -// // ratio_r_sm[f_sm], blockId * R * 2 - 2 + f_sm); -// } -// } else { -// if (f_sm < 2) { -// dist_r_sm[f_sm] = 0.0; -// ratio_r_sm[f_sm] = 0.0; -// } -// } - -// __syncthreads(); - -// int adjusted_nr_c = nr_c; -// if (r_gl < nr_c && c_gl < nc_c && f_gl < nf_c) { -// T h1 = dist_r_sm[r_sm * 2]; -// T h2 = dist_r_sm[r_sm * 2 + 1]; -// T h3 = dist_r_sm[r_sm * 2 + 2]; -// T h4 = dist_r_sm[r_sm * 2 + 3]; -// T r1 = ratio_r_sm[r_sm * 2]; -// T r2 = ratio_r_sm[r_sm * 2 + 1]; -// T r3 = ratio_r_sm[r_sm * 2 + 2]; -// T r4 = 1 - r3; -// T a = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2, c_sm, f_sm)]; -// T b = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 1, c_sm, f_sm)]; -// T c = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 2, c_sm, f_sm)]; -// T d = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 3, c_sm, f_sm)]; -// T e = v_sm[get_idx(ldsm1, ldsm2, r_sm * 2 + 4, c_sm, f_sm)]; - -// // __syncthreads(); -// // if (debug) { -// // printf("r_sm(%d) %f %f %f %f %f\n",r_sm, a,b,c,d,e); -// // printf("r_sm_h(%d) %f %f %f %f\n",r_sm, h1,h2,h3,h4); -// // printf("r_sm_r(%d) %f %f %f %f\n",r_sm, r1,r2,r3,r4); -// // } -// // __syncthreads(); - -// // T tb = a * h1/6 + b * 2 * (h1+h2)/6 + c * h2/6; -// // T tc = b * h2/6 + c * 2 * (h2+h3)/6 + d * h3/6; -// // T td = c * h3/6 + d * 2 * (h3+h4)/6 + e * h4/6; - -// // if (debug) printf("f_sm(%d) tb tc td tc: %f %f %f %f\n", f_sm, tb, tc, -// // td, tc+tb * r1 + td * r4); - -// // tc += tb * r1 + td * r4; - -// dw[get_idx(lddw1, lddw2, r_gl, c_gl, f_gl)] = -// 
mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4);
-
-// // if (debug) printf("store[%d %d %d] %f (%f)\n", r_gl, c_gl, f_gl,
-// // mass_trans(a, b, c, d, e, h1, h2, h3, h4, r1, r2, r3, r4),
-// // mass_trans(a, b, c, (T)0.0, (T)0.0, h1, (T)0.0,
-(T)0.0,
-// // h4, r1, r2, (T)0.0, (T)0.0));
-// // // printf("%d %d %d\n", r_gl, c_gl, f_gl);
-// // if (blockId * R + R == nr-1) {
-// // if (r_gl+1 == nr_c - 1) {
-// // if (r_gl+1 == nr_c - 1) {
-// // // T te = h4 * d + 2 * h4 * e;
-// // // te += td * r3;
-// // dw[get_idx(lddw1, lddw2, blockId * R + actual_R, c_gl, f_gl)] =
-// // mass_trans(c, d, e, (T)0.0, (T)0.0,
-// // h1, h2, (T)0.0, (T)0.0, r1, r2, (T)0.0, (T)0.0);
-
-// // if (debug) printf("store-last[%d %d %d] %f\n", blockId * R +
-actual_R,
-// // c_gl, f_gl,
-// // mass_trans(c, d, e, (T)0.0, (T)0.0,
-// // h1, h2, (T)0.0, (T)0.0, r1, r2, (T)0.0, (T)0.0));
-// // }
-// //}
-// }
-// }
-
-// template <DIM D, typename T, SIZE R, SIZE C, SIZE F>
-// void lpk_reo_3_3d_adaptive_launcher(Handle<D, T> &handle, SIZE nr, SIZE nc_c,
-//                                     SIZE nf_c, SIZE nr_c, T *ddist_r, T
-//                                     *dratio_r, T *dv1, SIZE lddv11, SIZE
-//                                     lddv12, T *dv2, SIZE lddv21, SIZE lddv22,
-//                                     T *dw, SIZE lddw1, SIZE lddw2, int
-//                                     queue_idx) {
-
-//   SIZE total_thread_z = nr_c;
-//   // if (nr_c % 2 == 1){ total_thread_z = nr_c - 1; }
-//   // else { total_thread_z = nr_c; }
-//   SIZE total_thread_y = nc_c;
-//   SIZE total_thread_x = nf_c;
-
-//   SIZE tbx, tby, tbz, gridx, gridy, gridz;
-//   dim3 threadsPerBlock, blockPerGrid;
-//   size_t sm_size;
-
-//   tbz = R;
-//   tby = C;
-//   tbx = F;
-//   sm_size = ((R * 2 + 3) * C * F + (R * 2 + 3) * 2) * sizeof(T);
-//   gridz = ceil((float)total_thread_z / tbz);
-//   gridy = ceil((float)total_thread_y / tby);
-//   gridx = ceil((float)total_thread_x / tbx);
-//   threadsPerBlock = dim3(tbx, tby, tbz);
-//   blockPerGrid = dim3(gridx, gridy, gridz);
-
-//   // printf("nr: %d nc_c: %d, nf_c: %d, nr_c: %d\n", nr, nc_c, nf_c, nr_c);
-//   // printf("tb: %d %d %d, grid: %d %d %d\n", tbx, tby, tbz, gridx, gridy,
-//   // gridz);
-//   _lpk_reo_3_3d<T, R, C, F><<<blockPerGrid, threadsPerBlock, sm_size,
-//                               *(cudaStream_t *)handle.get(queue_idx)>>>(
-//       nr, nc_c, nf_c, nr_c, ddist_r, dratio_r, dv1, lddv11, lddv12, dv2,
-//       lddv21, lddv22, dw, lddw1, lddw2);
-//   gpuErrchk(cudaGetLastError());
-//   if (handle.sync_and_check_all_kernels) {
-//     gpuErrchk(cudaDeviceSynchronize());
-//   }
-// }
-
-// template <DIM D, typename T>
-// void lpk_reo_3_3d(Handle<D, T> &handle, SIZE nr, SIZE nc_c, SIZE nf_c, SIZE
-// nr_c,
-//                   T *ddist_r, T *dratio_r, T *dv1, SIZE lddv11, SIZE lddv12,
-//                   T *dv2, SIZE lddv21, SIZE lddv22, T *dw, SIZE lddw1, SIZE
-//                   lddw2, int queue_idx, int config) {
-
-// #define LPK(R, C, F) \
-//   { \
-//     lpk_reo_3_3d_adaptive_launcher<D, T, R, C, F>( \
-//         handle, nr, nc_c, nf_c, nr_c, ddist_r, dratio_r, dv1, lddv11, lddv12, \
-//         dv2, lddv21, lddv22, dw, lddw1, lddw2, queue_idx); \
-//   }
-//   bool profile = false;
-//   if (handle.profile_kernels) {
-//     profile = true;
-//   }
-//   if (D == 3) {
-//     if (profile || config == 6) {
-//       LPK(2, 2, 128)
-//     }
-//     if (profile || config == 5) {
-//       LPK(2, 2, 64)
-//     }
-//     if (profile || config == 4) {
-//       LPK(2, 2, 32)
-//     }
-//     if (profile || config == 3) {
-//       LPK(4, 4, 16)
-//     }
-//     if (profile || config == 2) {
-//       LPK(8, 8, 8)
-//     }
-//     if (profile || config == 1) {
-//       LPK(4, 4, 4)
-//     }
-//     if (profile || config == 0) {
-//       LPK(2, 2, 2)
-//     }
-//   } else {
-//     printf("Error: mass_trans_multiply_3_cpt is only for 3D data\n");
-//   }
-
-// #undef LPK
-// }
-
 } // namespace mgard_x
 
 #endif
\ No newline at end of file
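Annotation: the hunk below replaces the commented-out `calc_*` declarations with public `CalcCoefficients3D`/`CalcCorrection3D`-style entry points plus `CopyND`/`AddND`/`SubtractND` helpers. A minimal sketch of how `decompose` might string these together for one level; the control flow is condensed from the removed .hpp implementation (the shipped code carves per-level sub-arrays and targets only the coarse block with the add), so treat it as illustrative only:

```cpp
#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.h"

namespace mgard_x {

// Illustrative one-level pass over the new API (not the shipped decompose).
template <DIM D, typename T, typename DeviceType>
void decompose_one_level(Hierarchy<D, T, DeviceType> &hierarchy,
                         SubArray<D, T, DeviceType> &v,
                         SubArray<D, T, DeviceType> &work, SIZE l,
                         int queue_idx) {
  CopyND(v, work, queue_idx); // stage the level-l data
  // Split work into coarse nodes + detail coefficients, written into v.
  CalcCoefficients3D(hierarchy, work, v, l, queue_idx);
  // Mass-transfer plus tri-diagonal solves produce the L2 correction.
  CalcCorrection3D(hierarchy, v, work, l, queue_idx);
  // The real decompose adds the correction only to the coarse sub-block.
  AddND(work, v, queue_idx);
}

} // namespace mgard_x
```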
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.h b/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.h
index d422509906..c59b246d2a 100644
--- a/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.h
+++ b/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.h
@@ -9,50 +9,64 @@ #define MGARD_X_DATA_REFACTORING
 
 // #include "Common.h"
-#include "../../Hierarchy.h"
+#include "../../Hierarchy/Hierarchy.h"
 #include "../../RuntimeX/RuntimeXPublic.h"
 
 namespace mgard_x {
 
-// template <DIM D, typename T, typename DeviceType>
-// void calc_coeff_pointers(Hierarchy<D, T, DeviceType> &hierarchy, DIM curr_dims[3], DIM l,
-//                          SubArray<D, T, DeviceType> doutput,
-//                          SubArray<D, T, DeviceType> &dcoarse,
-//                          SubArray<D, T, DeviceType> &dcoeff_f,
-//                          SubArray<D, T, DeviceType> &dcoeff_c,
-//                          SubArray<D, T, DeviceType> &dcoeff_r,
-//                          SubArray<D, T, DeviceType> &dcoeff_cf,
-//                          SubArray<D, T, DeviceType> &dcoeff_rf,
-//                          SubArray<D, T, DeviceType> &dcoeff_rc,
-//                          SubArray<D, T, DeviceType> &dcoeff_rcf);
-
-// template <DIM D, typename T, typename DeviceType>
-// void calc_coefficients_3d(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType> dinput,
-//                           SubArray<D, T, DeviceType> &doutput, SIZE l, int queue_idx);
-
-// template <DIM D, typename T, typename DeviceType>
-// void coefficients_restore_3d(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType>
-// dinput,
-//                              SubArray<D, T, DeviceType> &doutput, SIZE l, int queue_idx);
-
-// template <DIM D, typename T, typename DeviceType>
-// void calc_correction_3d(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType> dcoeff,
-//                         SubArray<D, T, DeviceType> &dcorrection, SIZE l, int queue_idx);
-
-// template <DIM D, typename T, typename DeviceType>
-// void calc_coefficients_nd(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType> dinput1,
-//                           SubArray<D, T, DeviceType> dinput2,
-//                           SubArray<D, T, DeviceType> &doutput, SIZE l, int queue_idx);
-
-// template <DIM D, typename T, typename DeviceType>
-// void coefficients_restore_nd(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType>
-// dinput1,
-//                              SubArray<D, T, DeviceType> dinput2,
-//                              SubArray<D, T, DeviceType> &doutput, SIZE l, int queue_idx);
-
-// template <DIM D, typename T, typename DeviceType>
-// void calc_correction_nd(Hierarchy<D, T, DeviceType> &hierarchy, SubArray<D, T, DeviceType> dcoeff,
-//                         SubArray<D, T, DeviceType> &dcorrection, SIZE l, int queue_idx);
+static bool multidim_refactoring_store = false;
+static bool multidim_refactoring_verify = false;
+static bool multidim_refactoring_debug_print = false;
+
+template <DIM D, typename T, typename DeviceType>
+void CalcCoefficients3D(Hierarchy<D, T, DeviceType> &hierarchy,
+                        SubArray<D, T, DeviceType> dinput,
+                        SubArray<D, T, DeviceType> &doutput, SIZE l,
+                        int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CoefficientsRestore3D(Hierarchy<D, T, DeviceType> &hierarchy,
+                           SubArray<D, T, DeviceType> dinput,
+                           SubArray<D, T, DeviceType> &doutput, SIZE l,
+                           int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CalcCorrection3D(Hierarchy<D, T, DeviceType> &hierarchy,
+                      SubArray<D, T, DeviceType> dcoeff,
+                      SubArray<D, T, DeviceType> &dcorrection, SIZE l,
+                      int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CalcCoefficientsND(Hierarchy<D, T, DeviceType> &hierarchy,
+                        SubArray<D, T, DeviceType> dinput1,
+                        SubArray<D, T, DeviceType> dinput2,
+                        SubArray<D, T, DeviceType> &doutput, SIZE l,
+                        int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CoefficientsRestoreND(Hierarchy<D, T, DeviceType> &hierarchy,
+                           SubArray<D, T, DeviceType> dinput1,
+                           SubArray<D, T, DeviceType> dinput2,
+                           SubArray<D, T, DeviceType> &doutput, SIZE l,
+                           int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CalcCorrectionND(Hierarchy<D, T, DeviceType> &hierarchy,
+                      SubArray<D, T, DeviceType> dcoeff,
+                      SubArray<D, T, DeviceType> &dcorrection, SIZE l,
+                      int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void CopyND(SubArray<D, T, DeviceType> dinput,
+            SubArray<D, T, DeviceType> &doutput, int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void AddND(SubArray<D, T, DeviceType> dinput,
+           SubArray<D, T, DeviceType> &doutput, int queue_idx);
+
+template <DIM D, typename T, typename DeviceType>
+void SubtractND(SubArray<D, T, DeviceType> dinput,
+                SubArray<D, T, DeviceType> &doutput, int queue_idx);
 
 template <DIM D, typename T, typename DeviceType>
 void decompose(Hierarchy<D, T, DeviceType> &hierarchy,
diff --git a/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp b/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp
index d994ef7b21..8c299be04b 100644
--- a/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp
+++ b/include/mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp
@@ -5,1684 +5,15 @@ * Date: March 17, 2022
  */
 
-#include "../../Hierarchy.hpp"
+#include "../../Hierarchy/Hierarchy.hpp"
 #include "../../RuntimeX/RuntimeX.h"
 
-// #include "SubArray.hpp"
-// #include "DeviceAdapters/DeviceAdapterCuda.h"
-
-// #include "DataRefactoring/Coefficient/GridProcessingKernel.h"
 #include "Coefficient/GridProcessingKernel.hpp"
-// #include
"cuda/DataRefactoring/Coefficient/GridProcessingKernel2.hpp" - -// #include "DataRefactoring/Coefficient/GridProcessingKernel3D.h" -#include "Coefficient/GridProcessingKernel3D.hpp" -// #include "cuda/DataRefactoring/Coefficient/GridProcessingKernel2.hpp" -// #include "DataRefactoring/Correction/IterativeProcessingKernel.h" -// #include "DataRefactoring/Correction/IterativeProcessingKernel3D.h" -#include "Correction/IterativeProcessingKernel.hpp" -#include "Correction/IterativeProcessingKernel3D.hpp" -// #include "LevelwiseProcessingKernel.h" -#include "Correction/LevelwiseProcessingKernel.hpp" -// #include "DataRefactoring/Correction/LinearProcessingKernel.h" -#include "Correction/LinearProcessingKernel.hpp" -// #include "DataRefactoring/Correction/LinearProcessingKernel3D.h" -#include "Correction/LinearProcessingKernel3D.hpp" #include "DataRefactoring.h" -// #include "cuda/Testing/ReorderToolsGPU.hpp" - #include -#include namespace mgard_x { -static bool store = false; -static bool verify = false; -static bool debug_print = false; - -template -void CompareSubarray4D(SubArrayType subArray1, SubArrayType subArray2) { - if (SubArrayType::NumDims != 4) { - std::cout << log::log_err - << "CompareSubarray4D expects 4D subarray type.\n"; - exit(-1); - } - if (subArray1.getShape(3) != subArray2.getShape(3)) { - std::cout << log::log_err << "CompareSubarray4D mismatch 4D size.\n"; - exit(-1); - } - - using T = typename SubArrayType::DataType; - SIZE idx[4] = {0, 0, 0, 0}; - for (SIZE i = 0; i < subArray1.getShape(3); i++) { - idx[3] = i; - SubArrayType temp1 = subArray1; - SubArrayType temp2 = subArray2; - temp1.offset(3, i); - temp2.offset(3, i); - CompareSubarray("4D = " + std::to_string(i), temp1.Slice3D(0, 1, 2), - temp2.Slice3D(0, 1, 2)); - } -} - -template -void PrintSubarray4D(std::string name, SubArrayType subArray1) { - if (SubArrayType::NumDims != 4) { - std::cout << log::log_err << "PrintSubarray4D expects 4D subarray type.\n"; - exit(-1); - } - std::cout << name << "\n"; - using T = typename SubArrayType::DataType; - SIZE idx[4] = {0, 0, 0, 0}; - for (SIZE i = 0; i < subArray1.getShape(3); i++) { - idx[3] = i; - SubArrayType temp1 = subArray1; - temp1.offset(3, i); - PrintSubarray("i = " + std::to_string(i), temp1.Slice3D(0, 1, 2)); - } -} - -template -void calc_coeff_pointers( - Hierarchy &hierarchy, DIM curr_dims[3], DIM l, - SubArray doutput, SubArray &dcoarse, - SubArray &dcoeff_f, SubArray &dcoeff_c, - SubArray &dcoeff_r, SubArray &dcoeff_cf, - SubArray &dcoeff_rf, - SubArray &dcoeff_rc, - SubArray &dcoeff_rcf) { - - SIZE n[3]; - SIZE nn[3]; - for (DIM d = 0; d < 3; d++) { - n[d] = hierarchy.dofs[curr_dims[d]][l]; - nn[d] = hierarchy.dofs[curr_dims[d]][l + 1]; - } - - dcoarse = doutput; - dcoarse.resize(curr_dims[0], nn[0]); - dcoarse.resize(curr_dims[1], nn[1]); - dcoarse.resize(curr_dims[2], nn[2]); - - dcoeff_f = doutput; - dcoeff_f.offset(curr_dims[0], nn[0]); - dcoeff_f.resize(curr_dims[0], n[0] - nn[0]); - dcoeff_f.resize(curr_dims[1], nn[1]); - dcoeff_f.resize(curr_dims[2], nn[2]); - - dcoeff_c = doutput; - dcoeff_c.offset(curr_dims[1], nn[1]); - dcoeff_c.resize(curr_dims[0], nn[0]); - dcoeff_c.resize(curr_dims[1], n[1] - nn[1]); - dcoeff_c.resize(curr_dims[2], nn[2]); - - dcoeff_r = doutput; - dcoeff_r.offset(curr_dims[2], nn[2]); - dcoeff_r.resize(curr_dims[0], nn[0]); - dcoeff_r.resize(curr_dims[1], nn[1]); - dcoeff_r.resize(curr_dims[2], n[2] - nn[2]); - - dcoeff_cf = doutput; - dcoeff_cf.offset(curr_dims[0], nn[0]); - dcoeff_cf.offset(curr_dims[1], nn[1]); - 
dcoeff_cf.resize(curr_dims[0], n[0] - nn[0]); - dcoeff_cf.resize(curr_dims[1], n[1] - nn[1]); - dcoeff_cf.resize(curr_dims[2], nn[2]); - - dcoeff_rf = doutput; - dcoeff_rf.offset(curr_dims[0], nn[0]); - dcoeff_rf.offset(curr_dims[2], nn[2]); - dcoeff_rf.resize(curr_dims[0], n[0] - nn[0]); - dcoeff_rf.resize(curr_dims[1], nn[1]); - dcoeff_rf.resize(curr_dims[2], n[2] - nn[2]); - - dcoeff_rc = doutput; - dcoeff_rc.offset(curr_dims[1], nn[1]); - dcoeff_rc.offset(curr_dims[2], nn[2]); - dcoeff_rc.resize(curr_dims[0], nn[0]); - dcoeff_rc.resize(curr_dims[1], n[1] - nn[1]); - dcoeff_rc.resize(curr_dims[2], n[2] - nn[2]); - - dcoeff_rcf = doutput; - dcoeff_rcf.offset(curr_dims[0], nn[0]); - dcoeff_rcf.offset(curr_dims[1], nn[1]); - dcoeff_rcf.offset(curr_dims[2], nn[2]); - dcoeff_rcf.resize(curr_dims[0], n[0] - nn[0]); - dcoeff_rcf.resize(curr_dims[1], n[1] - nn[1]); - dcoeff_rcf.resize(curr_dims[2], n[2] - nn[2]); -} - -template -void calc_coefficients_3d(Hierarchy &hierarchy, - SubArray dinput, - SubArray &doutput, SIZE l, - int queue_idx) { - - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - - dinput.project(0, 1, 2); - doutput.project(0, 1, 2); - - SIZE f = hierarchy.dofs[0][l]; - SIZE c = hierarchy.dofs[1][l]; - SIZE r = hierarchy.dofs[2][l]; - SIZE ff = hierarchy.dofs[0][l + 1]; - SIZE cc = hierarchy.dofs[1][l + 1]; - SIZE rr = hierarchy.dofs[2][l + 1]; - - SubArray dcoarse = doutput; - dcoarse.resize({ff, cc, rr}); - SubArray dcoeff_f = doutput; - dcoeff_f.offset({ff, 0, 0}); - dcoeff_f.resize({f - ff, cc, rr}); - SubArray dcoeff_c = doutput; - dcoeff_c.offset({0, cc, 0}); - dcoeff_c.resize({ff, c - cc, rr}); - SubArray dcoeff_r = doutput; - dcoeff_r.offset({0, 0, rr}); - dcoeff_r.resize({ff, cc, r - rr}); - SubArray dcoeff_cf = doutput; - dcoeff_cf.offset({ff, cc, 0}); - dcoeff_cf.resize({f - ff, c - cc, rr}); - SubArray dcoeff_rf = doutput; - dcoeff_rf.offset({ff, 0, rr}); - dcoeff_rf.resize({f - ff, cc, r - rr}); - SubArray dcoeff_rc = doutput; - dcoeff_rc.offset({0, cc, rr}); - dcoeff_rc.resize({ff, c - cc, r - rr}); - SubArray dcoeff_rcf = doutput; - dcoeff_rcf.offset({ff, cc, rr}); - dcoeff_rcf.resize({f - ff, c - cc, r - rr}); - - // SubArray<1, T, DeviceType> ratio_r({hierarchy.dofs[2][l]}, - // hierarchy.ratio[2][l]); SubArray<1, T, DeviceType> - // ratio_c({hierarchy.dofs[1][l]}, hierarchy.ratio[1][l]); SubArray<1, T, - // DeviceType> ratio_f({hierarchy.dofs[0][l]}, hierarchy.ratio[0][l]); - - T *null = NULL; - GpkReo3D().Execute( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], - hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], SubArray(hierarchy.ratio_array[2][l]), - SubArray(hierarchy.ratio_array[1][l]), - SubArray(hierarchy.ratio_array[0][l]), - // ratio_r, ratio_c, ratio_f, - dinput, dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, - dcoeff_rc, dcoeff_rcf, queue_idx); - // hierarchy.sync_all(); - // if (debug_print) { - // PrintSubarray("after pi_Ql_reo", doutput); - // } - - // { - // std::vector shape2_rev(D); - // std::vector shape2_pad_rev(D); - // for (int i = 0; i < D; i++) { - // shape2_rev[i] = hierarchy.dofs[D-1-i][0]; - // shape2_pad_rev[i] = hierarchy.dofs[D-1-i][0] + 2; - // } - // 
mgard_cuda::Array input2(shape2_rev); - // mgard_cuda::Array work2(shape2_pad_rev); - - // MemoryManager::CopyND(input2.get_dv(), - // in_array2.get_ldvs_h()[0], - // dinput.data(), in_array.getLd(0), - // hierarchy.dofs[0][0], - // hierarchy.dofs[1][0] * - // hierarchy.linearized_depth, 0); - - // gpk_reo_3d( - // hierarchy, hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], hierarchy.ratio[2][l], hierarchy.ratio[1][l], - // hierarchy.ratio[0][l], dinput.data(), dinput.getLddv1(), - // dinput.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), dcoeff_f.data(), dcoeff_f.getLddv1(), - // dcoeff_f.getLddv2(), dcoeff_c.data(), dcoeff_c.getLddv1(), - // dcoeff_c.getLddv2(), dcoeff_r.data(), dcoeff_r.getLddv1(), - // dcoeff_r.getLddv2(), dcoeff_cf.data(), dcoeff_cf.getLddv1(), - // dcoeff_cf.getLddv2(), dcoeff_rf.data(), dcoeff_rf.getLddv1(), - // dcoeff_rf.getLddv2(), dcoeff_rc.data(), dcoeff_rc.getLddv1(), - // dcoeff_rc.getLddv2(), dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), - // dcoeff_rcf.getLddv2(), queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // } - - verify_matrix_cuda( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], - doutput.data(), doutput.getLd(0), doutput.getLd(1), doutput.getLd(0), - prefix + "gpk_reo_3d" + "_level_" + std::to_string(l), store, verify); - - if (debug_print) { - PrintSubarray("after pi_Ql_reo", doutput); - } -} - -template -void coefficients_restore_3d(Hierarchy &hierarchy, - SubArray dinput, - SubArray &doutput, SIZE l, - int queue_idx) { - - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - - dinput.project(0, 1, 2); - doutput.project(0, 1, 2); - - SIZE f = hierarchy.dofs[0][l]; - SIZE c = hierarchy.dofs[1][l]; - SIZE r = hierarchy.dofs[2][l]; - SIZE ff = hierarchy.dofs[0][l + 1]; - SIZE cc = hierarchy.dofs[1][l + 1]; - SIZE rr = hierarchy.dofs[2][l + 1]; - - SubArray dcoarse = dinput; - dcoarse.resize({ff, cc, rr}); - SubArray dcoeff_f = dinput; - dcoeff_f.offset({ff, 0, 0}); - dcoeff_f.resize({f - ff, cc, rr}); - SubArray dcoeff_c = dinput; - dcoeff_c.offset({0, cc, 0}); - dcoeff_c.resize({ff, c - cc, rr}); - SubArray dcoeff_r = dinput; - dcoeff_r.offset({0, 0, rr}); - dcoeff_r.resize({ff, cc, r - rr}); - SubArray dcoeff_cf = dinput; - dcoeff_cf.offset({ff, cc, 0}); - dcoeff_cf.resize({f - ff, c - cc, rr}); - SubArray dcoeff_rf = dinput; - dcoeff_rf.offset({ff, 0, rr}); - dcoeff_rf.resize({f - ff, cc, r - rr}); - SubArray dcoeff_rc = dinput; - dcoeff_rc.offset({0, cc, rr}); - dcoeff_rc.resize({ff, c - cc, r - rr}); - SubArray dcoeff_rcf = dinput; - dcoeff_rcf.offset({ff, cc, rr}); - dcoeff_rcf.resize({f - ff, c - cc, r - rr}); - - // SubArray<1, T, DeviceType> ratio_r({hierarchy.dofs[2][l]}, - // hierarchy.ratio[2][l]); SubArray<1, T, DeviceType> - // ratio_c({hierarchy.dofs[1][l]}, hierarchy.ratio[1][l]); SubArray<1, T, - // DeviceType> ratio_f({hierarchy.dofs[0][l]}, hierarchy.ratio[0][l]); - - GpkRev3D().Execute( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], - hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], SubArray(hierarchy.ratio_array[2][l]), - 
SubArray(hierarchy.ratio_array[1][l]), - SubArray(hierarchy.ratio_array[0][l]), - // ratio_r, ratio_c, ratio_f, - doutput, dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, - dcoeff_rc, dcoeff_rcf, 0, 0, 0, hierarchy.dofs[2][l], - hierarchy.dofs[1][l], hierarchy.dofs[0][l], queue_idx); - - T *null = NULL; - // gpk_rev_3d( - // hierarchy, hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], hierarchy.ratio[2][l], hierarchy.ratio[1][l], - // hierarchy.ratio[0][l], doutput.data(), doutput.getLddv1(), - // doutput.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, ldvs_h[0], ldvs_h[1], - // 0, 0, 0, hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // hierarchy.sync(0); - verify_matrix_cuda( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], - doutput.data(), doutput.getLd(0), doutput.getLd(1), doutput.getLd(0), - prefix + "gpk_rev_3d" + "_level_" + std::to_string(l), store, verify); - - // gpk_rev(hierarchy, - // shape, shape_c, hierarchy.ldws_h, ldvs_h, unprocessed_dims, - // 2, 1, 0, - // hierarchy.ratio[2][l], hierarchy.ratio[1][l], - // hierarchy.ratio[0][l], hierarchy.dw, hierarchy.ldws_h[0], - // hierarchy.ldws_h[1], dv, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, 0, - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, hierarchy.dofs[1][l+1], - // 0), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], 0, - // 0), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, hierarchy.dofs[1][l+1], - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], 0, - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], - // hierarchy.dofs[1][l+1], 0), ldvs_h[0], ldvs_h[1], - // // null,ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], - // hierarchy.dofs[1][l+1], hierarchy.dofs[0][l+1]), ldvs_h[0], - // ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // 0, 0, 0, hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], 0, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // print_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], doutput.data(), doutput.ldvs_h[0], doutput.ldvs_h[1], - // doutput.ldvs_h[0],); - - // gpk_rev(hierarchy, - // shape, shape_c, hierarchy.ldws_h, ldvs_h, unprocessed_dims, - // 2, 1, 0, 
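Annotation: the `offset`/`resize` carving above (and in the removed `calc_coeff_pointers`) addresses the eight sub-blocks of a level-l cube: one coarse block plus seven coefficient blocks, indexed by which dimensions take the fine remainder. A minimal standalone sketch of that decomposition (the `Block` type and `carve` helper are hypothetical; MGARD-X does this through `SubArray::offset`/`SubArray::resize`):

```cpp
// Hypothetical Block type standing in for a SubArray view.
struct Block {
  unsigned off[3];  // start index per dimension (f, c, r)
  unsigned size[3]; // extent per dimension
};

// mask bit d set -> fine remainder [nn[d], n[d]); clear -> coarse prefix
// [0, nn[d]). mask 0 yields dcoarse; 1, 2, 4 -> dcoeff_f/_c/_r;
// 3, 5, 6 -> dcoeff_cf/_rf/_rc; 7 -> dcoeff_rcf.
Block carve(const unsigned n[3], const unsigned nn[3], unsigned mask) {
  Block b;
  for (int d = 0; d < 3; d++) {
    bool fine = (mask >> d) & 1;
    b.off[d] = fine ? nn[d] : 0u;
    b.size[d] = fine ? n[d] - nn[d] : nn[d];
  }
  return b;
}
```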
- // hierarchy.ratio[2][l], hierarchy.ratio[1][l], - // hierarchy.ratio[0][l], hierarchy.dw, hierarchy.ldws_h[0], - // hierarchy.ldws_h[1], dv, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, 0, - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, hierarchy.dofs[1][l+1], - // 0), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], 0, - // 0), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], 0, hierarchy.dofs[1][l+1], - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], 0, - // hierarchy.dofs[0][l+1]), ldvs_h[0], ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], - // hierarchy.dofs[1][l+1], 0), ldvs_h[0], ldvs_h[1], - // // null,ldvs_h[0], ldvs_h[1], - // dv+get_idx(ldvs_h[0], ldvs_h[1], hierarchy.dofs[2][l+1], - // hierarchy.dofs[1][l+1], hierarchy.dofs[0][l+1]), ldvs_h[0], - // ldvs_h[1], - // // null, ldvs_h[0], ldvs_h[1], - // 0, 0, 0, hierarchy.dofs[2][l], hierarchy.dofs[1][l], - // hierarchy.dofs[0][l], 0, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - if (debug_print) { - PrintSubarray("after coeff-restore", doutput); - } -} - -template -void calc_correction_3d(Hierarchy &hierarchy, - SubArray dcoeff, - SubArray &dcorrection, SIZE l, - int queue_idx) { - - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - - SubArray dw_in1, dw_in2, dw_out; - - if (D >= 1) { - dw_in1 = dcoeff; - dw_in1.resize( - {hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); - dw_in2 = dcoeff; - dw_in2.offset({hierarchy.dofs[0][l + 1], 0, 0}); - dw_in2.resize({hierarchy.dofs[0][l] - hierarchy.dofs[0][l + 1], - hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); - dw_out = dcorrection; - dw_out.resize( - {hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l], hierarchy.dofs[2][l]}); - - Lpk1Reo3D().Execute( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l], - hierarchy.dofs[0][l + 1], hierarchy.dofs[2][l + 1], - hierarchy.dofs[1][l + 1], hierarchy.dofs[0][l + 1], - SubArray(hierarchy.dist_array[0][l]), - SubArray(hierarchy.ratio_array[0][l]), dw_in1, dw_in2, dw_out, - queue_idx); - - verify_matrix_cuda( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l + 1], - dw_out.data(), dw_out.getLd(0), dw_out.getLd(1), dw_out.getLd(0), - prefix + "lpk_reo_1_3d" + "_level_" + std::to_string(l), store, verify); - - if (debug_print) { - PrintSubarray("after mass_trans_multiply_1_cpt", dw_out); - } - } - - if (D >= 2) { - dw_in1 = dw_out; - dw_in1.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l]}); - dw_in2 = dw_out; - dw_in2.offset({0, hierarchy.dofs[1][l + 1], 0}); - dw_in2.resize({hierarchy.dofs[0][l + 1], - hierarchy.dofs[1][l] - hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l]}); - dw_out.offset({hierarchy.dofs[0][l + 1], 0, 0}); - dw_out.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l]}); - - 
Lpk2Reo3D().Execute( - hierarchy.dofs[2][l], hierarchy.dofs[1][l], hierarchy.dofs[0][l + 1], - hierarchy.dofs[1][l + 1], SubArray(hierarchy.dist_array[1][l]), - SubArray(hierarchy.ratio_array[1][l]), dw_in1, dw_in2, dw_out, - queue_idx); - - verify_matrix_cuda(hierarchy.dofs[2][l], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), - dw_out.getLd(1), dw_out.getLd(0), - prefix + "lpk_reo_2_3d" + "_level_" + std::to_string(l), - store, verify); - - if (debug_print) { - PrintSubarray("after mass_trans_multiply_2_cpt", dw_out); - } - } - - if (D == 3) { - dw_in1 = dw_out; - dw_in1.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l + 1]}); - dw_in2 = dw_out; - dw_in2.offset({0, 0, hierarchy.dofs[2][l + 1]}); - dw_in2.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l] - hierarchy.dofs[2][l + 1]}); - dw_out.offset({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], 0}); - dw_out.resize({hierarchy.dofs[0][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[2][l + 1]}); - - Lpk3Reo3D().Execute( - hierarchy.dofs[2][l], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], hierarchy.dofs[2][l + 1], - SubArray(hierarchy.dist_array[2][l]), - SubArray(hierarchy.ratio_array[2][l]), dw_in1, dw_in2, dw_out, - queue_idx); - - verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), - dw_out.getLd(1), dw_out.getLd(0), - prefix + "lpk_reo_3_3d" + "_level_" + std::to_string(l), - store, verify); - - if (debug_print) { - PrintSubarray("after mass_trans_multiply_3_cpt", dw_out); - } - } - - if (D >= 1) { - Ipk1Reo3D().Execute( - hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[0][l + 1]), - SubArray(hierarchy.bm_array[0][l + 1]), - SubArray(hierarchy.dist_array[0][l + 1]), dw_out, queue_idx); - verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), - dw_out.getLd(1), dw_out.getLd(0), - prefix + "ipk_1_3d" + "_level_" + std::to_string(l), - store, verify); - - if (debug_print) { - PrintSubarray("after solve_tridiag_1_cpt", dw_out); - } - } - if (D >= 2) { - Ipk2Reo3D().Execute( - hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[1][l + 1]), - SubArray(hierarchy.bm_array[1][l + 1]), - SubArray(hierarchy.dist_array[1][l + 1]), dw_out, queue_idx); - - verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), - dw_out.getLd(1), dw_out.getLd(0), - prefix + "ipk_2_3d" + "_level_" + std::to_string(l), - store, verify); - - if (debug_print) { - PrintSubarray("after solve_tridiag_2_cpt", dw_out); - } - } - if (D == 3) { - Ipk3Reo3D().Execute( - hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], SubArray(hierarchy.am_array[2][l + 1]), - SubArray(hierarchy.bm_array[2][l + 1]), - SubArray(hierarchy.dist_array[2][l + 1]), dw_out, queue_idx); - - verify_matrix_cuda(hierarchy.dofs[2][l + 1], hierarchy.dofs[1][l + 1], - hierarchy.dofs[0][l + 1], dw_out.data(), dw_out.getLd(0), - dw_out.getLd(1), dw_out.getLd(0), - prefix + "ipk_3_3d" + "_level_" + std::to_string(l), - store, verify); - - if (debug_print) { - PrintSubarray("after solve_tridiag_3_cpt", dw_out); - } - } - // final correction output - dcorrection = dw_out; -} - -template -void calc_coefficients_nd(Hierarchy 
&hierarchy, - SubArray dinput1, - SubArray dinput2, - SubArray &doutput, SIZE l, - int queue_idx) { - - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - // printf("interpolate 1-3D\n"); - - SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, - dcoeff_rf, dcoeff_rc, dcoeff_rcf; - - DIM curr_dims[3]; - - int unprocessed_idx = 0; - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - - calc_coeff_pointers(hierarchy, curr_dims, l, doutput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - // unprocessed_dims_subarray, - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], - curr_dims[1], curr_dims[0], - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), - // ratio_r, ratio_c, ratio_f, - dinput1, dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, - dcoeff_rc, dcoeff_rcf, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - for (DIM d = 3; d < D; d += 2) { - // copy back to input1 for interpolation again - LwpkReo().Execute(doutput, dinput1, queue_idx); - - // printf("interpolate %u-%uD\n", d+1, d+2); - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - calc_coeff_pointers(hierarchy, curr_dims, l, doutput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - if (D - d == 1) { - unprocessed_idx += 1; - - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], - curr_dims[1], curr_dims[0], - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, queue_idx); - - } else { // D - d >= 2 - unprocessed_idx += 2; - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, queue_idx); - } - } - - if (debug_print) { // debug - PrintSubarray4D("after 
interpolation", doutput); - } // debug - - unprocessed_idx = 0; - // printf("reorder 1-3D\n"); - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); - dinput1.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp output - - calc_coeff_pointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput2, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, - queue_idx); - - DIM D_reduced = D % 2 == 0 ? D - 1 : D - 2; - for (DIM d = 3; d < D_reduced; d += 2) { - // copy back to input2 for reordering again - - LwpkReo().Execute(dinput1, dinput2, queue_idx); - - unprocessed_idx += 2; - // printf("reorder %u-%uD\n", d+1, d+2); - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); - dinput1.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp output - - calc_coeff_pointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput2, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - } - - // printf("calc coeff %u-%dD\n", D_reduced+1, D_reduced+2); - curr_dims[0] = 0; - curr_dims[1] = D_reduced; - curr_dims[2] = D_reduced + 1; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp output - calc_coeff_pointers(hierarchy, curr_dims, l, doutput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - if (D - D_reduced == 1) { - unprocessed_idx += 1; - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - } else { // D-D_reduced == 2 
- unprocessed_idx += 2; - - GpkReo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), curr_dims[2], - curr_dims[1], curr_dims[0], - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, queue_idx); - } - - if (debug_print) { // debug - PrintSubarray4D("after calc coeff", doutput); - } // debug -} - -template -void coefficients_restore_nd(Hierarchy &hierarchy, - SubArray dinput1, - SubArray dinput2, - SubArray &doutput, SIZE l, - int queue_idx) { - - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - - SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, - dcoeff_rf, dcoeff_rc, dcoeff_rcf; - - DIM curr_dims[3]; - int unprocessed_idx = 0; - - // printf("interpolate-restore 1-3D\n"); - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - - calc_coeff_pointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], doutput.getLdd(), dinput1.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], doutput.data(), doutput.getLddv1(), - // doutput.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - 
SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, - 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - hierarchy.dofs[curr_dims[0]][l], queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - for (DIM d = 3; d < D; d += 2) { - // lwpk(hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // doutput.data(), doutput.getLdd(), - // dinput1.data(), dinput1.getLdd(), queue_idx); - - // gpuErrchk(cudaDeviceSynchronize()); - LwpkReo().Execute(doutput, dinput1, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - // printf("interpolate-restore %u-%uD\n", d+1, d+2); - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - calc_coeff_pointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - if (D - d == 1) { - unprocessed_idx += 1; - // unprocessed_dims_subarray = SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], doutput.getLdd(), dinput1.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], doutput.data(), - // doutput.getLddv1(), doutput.getLddv2(), dcoarse.data(), - // dcoarse.getLddv1(), dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - 
SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], - hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - } else { // D - d >= 2 - unprocessed_idx += 2; - // unprocessed_dims_subarray = SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], doutput.getLdd(), dinput1.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], doutput.data(), - // doutput.getLddv1(), doutput.getLddv2(), dcoarse.data(), - // dcoarse.getLddv1(), dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], - hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - } - } - // Done interpolation-restore on doutput - - if (debug_print) { // debug - PrintSubarray4D("After interpolation reverse-reorder", doutput); - } // debug - - unprocessed_idx = 0; - - // printf("reorder-restore 1-3D\n"); - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput2.project(curr_dims[0], 
curr_dims[1], curr_dims[2]); - dinput1.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp space - - // unprocessed_dims_subarray = SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - calc_coeff_pointers(hierarchy, curr_dims, l, dinput2, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], dinput1.getLdd(), dinput2.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], dinput1.data(), dinput1.getLddv1(), - // dinput1.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, dcoeff_rcf, - 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - hierarchy.dofs[curr_dims[0]][l], queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - DIM D_reduced = D % 2 == 0 ? 
D - 1 : D - 2; - for (DIM d = 3; d < D_reduced; d += 2) { - // printf("reorder-reverse\n"); - // copy back to input2 for reordering again - // lwpk(hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // dinput1.data(), dinput1.getLdd(), dinput2.data(), - // dinput2.getLdd(), queue_idx); - - // gpuErrchk(cudaDeviceSynchronize()); - LwpkReo().Execute(dinput1, dinput2, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - // printf("reorder-restore %u-%uD\n", d+1, d+2); - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput2.project(curr_dims[0], curr_dims[1], curr_dims[2]); - dinput1.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp output - - calc_coeff_pointers(hierarchy, curr_dims, l, dinput2, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - unprocessed_idx += 2; - - // unprocessed_dims_subarray = SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], dinput1.getLdd(), dinput2.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], dinput1.data(), dinput1.getLddv1(), - // dinput1.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), dinput1, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], - hierarchy.dofs[curr_dims[1]][l], 
hierarchy.dofs[curr_dims[0]][l], - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - } - - // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+2); - curr_dims[0] = 0; - curr_dims[1] = D_reduced; - curr_dims[2] = D_reduced + 1; - dinput1.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - calc_coeff_pointers(hierarchy, curr_dims, l, dinput1, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - if (D - D_reduced == 1) { - // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+1); - unprocessed_idx += 1; - - // unprocessed_dims_subarray = SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], doutput.getLdd(), dinput1.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], doutput.data(), doutput.getLddv1(), - // doutput.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], - hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - } else { // D - D_reduced >= 2 - // printf("coeff-restore %u-%dD\n", D_reduced+1, D_reduced+2); - unprocessed_idx += 2; - - // unprocessed_dims_subarray = 
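Both the forward coefficient pass and this reverse pass sweep the higher dimensions two at a time: dimensions {0, 1, 2} first, then {0, d, d + 1} for d = 3, 5, and so on, with D_reduced trimming the loop so the tail pass covers the one or two dimensions left over (the D - D_reduced == 1 branch further down). The following standalone snippet only replays that index arithmetic for a few values of D; it is an illustration, not part of the patch:

#include <cstdio>

// Illustration only: the two-dimensions-per-pass schedule used by the
// multidimensional coefficient kernels, with D_reduced = D - 1 for even D
// and D - 2 for odd D, exactly as computed in the code above.
int main() {
  for (unsigned D = 4; D <= 7; ++D) {
    unsigned D_reduced = (D % 2 == 0) ? D - 1 : D - 2;
    std::printf("D=%u: {0,1,2}", D);
    for (unsigned d = 3; d < D_reduced; d += 2)
      std::printf(" {0,%u,%u}", d, d + 1);
    std::printf(" tail {0,%u,%u} (%u leftover)\n", D_reduced, D_reduced + 1,
                D - D_reduced);
  }
}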
SubArray<1, DIM, - // DeviceType>({(SIZE)hierarchy.unprocessed_n[unprocessed_idx]}, - // hierarchy.unprocessed_dims_d[unprocessed_idx]); - // ratio_r = SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dims[2]][l]}, - // hierarchy.ratio[curr_dims[2]][l]); ratio_c = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[1]][l]}, - // hierarchy.ratio[curr_dims[1]][l]); ratio_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dims[0]][l]}, - // hierarchy.ratio[curr_dims[0]][l]); - - // gpk_rev( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_d[l], - // hierarchy.shapes_d[l + 1], doutput.getLdd(), dinput1.getLdd(), - // hierarchy.unprocessed_n[unprocessed_idx], - // hierarchy.unprocessed_dims_d[unprocessed_idx], - // curr_dims[2], curr_dims[1], curr_dims[0], - // hierarchy.ratio[curr_dims[2]][l], hierarchy.ratio[curr_dims[1]][l], - // hierarchy.ratio[curr_dims[0]][l], doutput.data(), doutput.getLddv1(), - // doutput.getLddv2(), dcoarse.data(), dcoarse.getLddv1(), - // dcoarse.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_f.data(), dcoeff_f.getLddv1(), dcoeff_f.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_c.data(), dcoeff_c.getLddv1(), dcoeff_c.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_r.data(), dcoeff_r.getLddv1(), dcoeff_r.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_cf.data(), dcoeff_cf.getLddv1(), dcoeff_cf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rf.data(), dcoeff_rf.getLddv1(), dcoeff_rf.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rc.data(), dcoeff_rc.getLddv1(), dcoeff_rc.getLddv2(), - // // null, lddv1, lddv2, - // dcoeff_rcf.data(), dcoeff_rcf.getLddv1(), dcoeff_rcf.getLddv2(), - // // null, lddv1, lddv2, - // 0, 0, 0, - // hierarchy.dofs[curr_dims[2]][l], hierarchy.dofs[curr_dims[1]][l], - // hierarchy.dofs[curr_dims[0]][l], queue_idx, - // hierarchy.auto_tuning_cc[hierarchy.arch][hierarchy.precision][range_l]); - - // gpuErrchk(cudaDeviceSynchronize()); - GpkRev().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.unprocessed_n[unprocessed_idx], - SubArray(hierarchy.unprocessed_dims[unprocessed_idx]), - // unprocessed_dims_subarray, - curr_dims[2], curr_dims[1], curr_dims[0], - // ratio_r, ratio_c, ratio_f, - SubArray(hierarchy.ratio_array[curr_dims[2]][l]), - SubArray(hierarchy.ratio_array[curr_dims[1]][l]), - SubArray(hierarchy.ratio_array[curr_dims[0]][l]), doutput, dcoarse, - dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf, 0, 0, 0, hierarchy.dofs[curr_dims[2]][l], - hierarchy.dofs[curr_dims[1]][l], hierarchy.dofs[curr_dims[0]][l], - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - } - - if (debug_print) { // debug - PrintSubarray4D("After coeff restore", doutput); - } // debug -} - -template -void calc_correction_nd(Hierarchy &hierarchy, - SubArray dcoeff, - SubArray &dcorrection, SIZE l, - int queue_idx) { - int range_l = std::min(6, (int)std::log2(hierarchy.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(hierarchy.dofs[0][l + 1]) - 1); - - std::string prefix = "decomp_"; - if (sizeof(T) == sizeof(double)) - prefix += "d_"; - if (sizeof(T) == sizeof(float)) - prefix += "f_"; - for (int d = 0; d < D; d++) - prefix += std::to_string(hierarchy.shape[d]) + "_"; - - SubArray dw_in1 = dcoeff; - SubArray dw_in2 = dcoeff; - SubArray dw_out = dcorrection; - - // start correction calculation - int prev_dim_r, prev_dim_c, prev_dim_f; - int curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 
2; - - dw_in1.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); - dw_in2.offset(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); - dw_in2.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l] - - hierarchy.dofs[curr_dim_f][l + 1]); - dw_out.resize(curr_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); - - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("mass trans 1D\n"); - // lpk_reo_1( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], dw_in1.getLdd(), - // dw_out.getLdd(), hierarchy.processed_n[0], - // hierarchy.processed_dims_h[0], hierarchy.processed_dims_d[0], - // curr_dim_r, curr_dim_c, curr_dim_f, hierarchy.dist[curr_dim_f][l], - // hierarchy.ratio[curr_dim_f][l], dw_in1.data(), dw_in1.getLddv1(), - // dw_in1.getLddv2(), dw_in2.data(), dw_in2.getLddv1(), dw_in2.getLddv2(), - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_mr1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // SubArray<1, T, DeviceType> dist_f = SubArray<1, T, - // DeviceType>({hierarchy.dofs[curr_dim_f][l]}, - // hierarchy.dist[curr_dim_f][l]); SubArray<1, T, DeviceType> ratio_f = - // SubArray<1, T, DeviceType>({hierarchy.dofs[curr_dim_f][l]}, - // hierarchy.ratio[curr_dim_f][l]); gpuErrchk(cudaDeviceSynchronize()); - Lpk1Reo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.processed_n[0], - SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[0], true), - curr_dim_r, curr_dim_c, curr_dim_f, - // dist_f, ratio_f, - SubArray(hierarchy.dist_array[curr_dim_f][l]), - SubArray(hierarchy.ratio_array[curr_dim_f][l]), dw_in1, dw_in2, dw_out, - 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after MR-1D[{}]", l), dw_out); - } - - // mass trans 2D - prev_dim_f = curr_dim_f; - prev_dim_c = curr_dim_c; - prev_dim_r = curr_dim_r; - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - - dw_in1 = dw_out; - dw_in2 = dw_out; - dw_in1.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); - dw_in2.offset(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); - dw_in2.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l] - - hierarchy.dofs[curr_dim_c][l + 1]); - dw_out.offset(prev_dim_f, hierarchy.dofs[curr_dim_f][l + 1]); - dw_out.resize(curr_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); - - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("mass trans 2D\n"); - // lpk_reo_2( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_in1.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[1], hierarchy.processed_dims_h[1], - // hierarchy.processed_dims_d[1], - // curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.dist[curr_dim_c][l], hierarchy.ratio[curr_dim_c][l], - // dw_in1.data(), dw_in1.getLddv1(), dw_in1.getLddv2(), - // dw_in2.data(), dw_in2.getLddv1(), dw_in2.getLddv2(), - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_mr1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Lpk2Reo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - 
SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.processed_n[1], - SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[1], true), - curr_dim_r, curr_dim_c, curr_dim_f, - // dist_f, ratio_f, - SubArray(hierarchy.dist_array[curr_dim_c][l]), - SubArray(hierarchy.ratio_array[curr_dim_c][l]), dw_in1, dw_in2, dw_out, - 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after MR-2D[{}]", l), dw_out); - } - - // mass trans 3D - - prev_dim_f = curr_dim_f; - prev_dim_c = curr_dim_c; - prev_dim_r = curr_dim_r; - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - - dw_in1 = dw_out; - dw_in2 = dw_out; - dw_in1.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - dw_in2.offset(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - dw_in2.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l] - - hierarchy.dofs[curr_dim_r][l + 1]); - dw_out.offset(prev_dim_c, hierarchy.dofs[curr_dim_c][l + 1]); - dw_out.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("mass trans 3D\n"); - // lpk_reo_3( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_in1.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[2], hierarchy.processed_dims_h[2], - // hierarchy.processed_dims_d[2], - // curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.dist[curr_dim_r][l], hierarchy.ratio[curr_dim_r][l], - // dw_in1.data(), dw_in1.getLddv1(), dw_in1.getLddv2(), - // dw_in2.data(), dw_in2.getLddv1(), dw_in2.getLddv2(), - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_mr1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Lpk3Reo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.processed_n[2], - SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[2], true), - curr_dim_r, curr_dim_c, curr_dim_f, - // dist_f, ratio_f, - SubArray(hierarchy.dist_array[curr_dim_r][l]), - SubArray(hierarchy.ratio_array[curr_dim_r][l]), dw_in1, dw_in2, dw_out, - 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after MR-3D[{}]", l), dw_out); - } - - // mass trans 4D+ - for (int i = 3; i < D; i++) { - prev_dim_f = curr_dim_f; - prev_dim_c = curr_dim_c; - prev_dim_r = curr_dim_r; - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = i; - dw_in1 = dw_out; - dw_in2 = dw_out; - dw_in1.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - dw_in2.offset(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - dw_in2.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l] - - hierarchy.dofs[curr_dim_r][l + 1]); - dw_out.offset(prev_dim_r, hierarchy.dofs[prev_dim_r][l + 1]); - dw_out.resize(curr_dim_r, hierarchy.dofs[curr_dim_r][l + 1]); - - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("mass trans %dD\n", i+1); - // lpk_reo_3( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_in1.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[i], hierarchy.processed_dims_h[i], - // 
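The offset/resize pairs in this function encode a single invariant: along the current dimension, dw_in1 aliases the first dofs[d][l+1] entries of the working buffer (the coarse nodes), dw_in2 aliases the remaining dofs[d][l] - dofs[d][l+1] entries (the coefficients), and dw_out is shrunk to the coarse extent. A minimal standalone sketch of that split, assuming the usual 2^k + 1 nodes per level (illustration only, not part of the patch):

#include <cstdio>

// Illustration only: how an axis of length n_l = dofs[d][l] is aliased into
// a coarse range [0, n_lp1) (dw_in1) and a coefficient range [n_lp1, n_l)
// (dw_in2) of the same buffer.
int main() {
  for (unsigned k = 1; k <= 4; ++k) {
    unsigned n_l = (1u << k) + 1;   // 2^k + 1 nodes at level l (assumption)
    unsigned n_lp1 = (n_l + 1) / 2; // coarse nodes at level l + 1
    std::printf("n_l=%2u: coarse [0,%u), coeff [%u,%u)\n", n_l, n_lp1, n_lp1,
                n_l);
  }
}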
hierarchy.processed_dims_d[i], - // curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.dist[curr_dim_r][l], hierarchy.ratio[curr_dim_r][l], - // dw_in1.data(), dw_in1.getLddv1(), dw_in1.getLddv2(), - // dw_in2.data(), dw_in2.getLddv1(), dw_in2.getLddv2(), - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_mr1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Lpk3Reo().Execute( - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l], true), - SubArray<1, SIZE, DeviceType>(hierarchy.shapes[l + 1], true), - hierarchy.processed_n[i], - SubArray<1, SIZE, DeviceType>(hierarchy.processed_dims[i], true), - curr_dim_r, curr_dim_c, curr_dim_f, - // dist_f, ratio_f, - SubArray(hierarchy.dist_array[curr_dim_r][l]), - SubArray(hierarchy.ratio_array[curr_dim_r][l]), dw_in1, dw_in2, dw_out, - 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after MR-{}D[{}]", i + 1, l), - dw_out); - } - } - - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("solve tridiag 1D\n"); - // ipk_1( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_out.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[0], hierarchy.processed_dims_h[0], - // hierarchy.processed_dims_d[0], - // curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.am[curr_dim_f][l + 1], hierarchy.bm[curr_dim_f][l + 1], - // hierarchy.dist[curr_dim_f][l + 1], - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_ts1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Ipk1Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_f][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_f][l + 1]), dw_out, 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after TR-1D[{}]", l), dw_out); - } // debug - - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("solve tridiag 2D\n"); - // ipk_2( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_out.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[1], hierarchy.processed_dims_h[1], - // hierarchy.processed_dims_d[1], - // curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.am[curr_dim_c][l + 1], hierarchy.bm[curr_dim_c][l + 1], - // hierarchy.dist[curr_dim_c][l + 1], - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_ts1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Ipk2Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_c][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_c][l + 1]), - // SubArray(hierarchy.dist_array[curr_dim_f][l+1]), - dw_out, 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after TR-2D[{}]", l), dw_out); - } // debug - - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - 
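The "solve tridiag" stages that follow invert a mass matrix along one axis at a time, with Ipk1Reo/Ipk2Reo/Ipk3Reo consuming the am/bm factors stored per dimension and level. A serial Thomas-algorithm solve shows the kind of system involved (illustration only; that am/bm hold precomputed elimination factors is an assumption based on how they are indexed by [dim][l + 1] here):

#include <cstddef>
#include <cstdio>
#include <vector>

// Illustration only: a serial tridiagonal solve of the kind the Ipk kernels
// perform along one axis. a/b/c are the sub-, main and super-diagonals; d is
// the right-hand side and receives the solution.
void thomas_solve(std::vector<double> a, std::vector<double> b,
                  std::vector<double> c, std::vector<double> &d) {
  const std::size_t n = d.size();
  for (std::size_t i = 1; i < n; ++i) { // forward elimination
    const double m = a[i] / b[i - 1];
    b[i] -= m * c[i - 1];
    d[i] -= m * d[i - 1];
  }
  d[n - 1] /= b[n - 1]; // back substitution
  for (std::size_t i = n - 1; i-- > 0;)
    d[i] = (d[i] - c[i] * d[i + 1]) / b[i];
}

int main() {
  // A small diagonally dominant system with a right-hand side of ones.
  std::vector<double> a(5, 1.0), b(5, 4.0), c(5, 1.0), d(5, 1.0);
  thomas_solve(a, b, c, d);
  for (const double x : d)
    std::printf("%g ", x);
  std::printf("\n");
}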
dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - - // printf("solve tridiag 3D\n"); - // ipk_3( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_out.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[2], hierarchy.processed_dims_h[2], - // hierarchy.processed_dims_d[2], curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.am[curr_dim_r][l + 1], hierarchy.bm[curr_dim_r][l + 1], - // hierarchy.dist[curr_dim_r][l + 1], - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_ts1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Ipk3Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_r][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), - // SubArray(hierarchy.dist_array[curr_dim_f][l+1]), - dw_out, 0); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after TR-3D[{}]", l), dw_out); - } // debug - - // mass trans 4D+ - for (int i = 3; i < D; i++) { - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = i; - dw_in1.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_in2.project(curr_dim_f, curr_dim_c, curr_dim_r); - dw_out.project(curr_dim_f, curr_dim_c, curr_dim_r); - // printf("solve tridiag %dD\n", i+1); - // ipk_3( - // hierarchy, hierarchy.shapes_h[l], hierarchy.shapes_h[l + 1], - // hierarchy.shapes_d[l], hierarchy.shapes_d[l + 1], - // dw_out.getLdd(), dw_out.getLdd(), - // hierarchy.processed_n[i], hierarchy.processed_dims_h[i], - // hierarchy.processed_dims_d[i], curr_dim_r, curr_dim_c, curr_dim_f, - // hierarchy.am[curr_dim_r][l + 1], hierarchy.bm[curr_dim_r][l + 1], - // hierarchy.dist[curr_dim_r][l + 1], - // dw_out.data(), dw_out.getLddv1(), dw_out.getLddv2(), queue_idx, - // hierarchy.auto_tuning_ts1[hierarchy.arch][hierarchy.precision][range_lp1]); - - // gpuErrchk(cudaDeviceSynchronize()); - Ipk3Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_r][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), - // SubArray(hierarchy.dist_array[curr_dim_f][l+1]), - dw_out, 0); - // gpuErrchk(cudaDeviceSynchronize()); - if (debug_print) { // debug - PrintSubarray4D(format("decomposition: after TR-{}D[{}]", i + 1, l), - dw_out); - } // debug - } - - dcorrection = dw_out; -} - template void decompose(Hierarchy &hierarchy, SubArray &v, SIZE l_target, int queue_idx) { @@ -1710,85 +41,71 @@ void decompose(Hierarchy &hierarchy, SubArray w_correction = w; SubArray v_coarse = v; - if (D <= 3) { + if constexpr (D <= 3) { for (int l = 0; l < l_target; ++l) { - if (debug_print) { + if (multidim_refactoring_debug_print) { PrintSubarray("input v", v); } - // DeviceRuntime::SyncDevice(); v_fine.resize(hierarchy.shapes2[l]); w_fine.resize(hierarchy.shapes2[l]); - LwpkReo().Execute(v_fine, w_fine, queue_idx); - // DeviceRuntime::SyncDevice(); + CopyND(v_fine, w_fine, queue_idx); + v_coeff.resize(hierarchy.shapes2[l]); - calc_coefficients_3d(hierarchy, w_fine, v_coeff, l, queue_idx); - // DeviceRuntime::SyncDevice(); + CalcCoefficients3D(hierarchy, w_fine, v_coeff, l, queue_idx); + w_correction.resize(hierarchy.shapes2[l]); - calc_correction_3d(hierarchy, v_coeff, w_correction, l, queue_idx); - // DeviceRuntime::SyncDevice(); + CalcCorrection3D(hierarchy, v_coeff, w_correction, l, 
queue_idx); w_correction.resize(hierarchy.shapes2[l + 1]); v_coarse.resize(hierarchy.shapes2[l + 1]); - LwpkReo().Execute(w_correction, v_coarse, - queue_idx); - // DeviceRuntime::SyncDevice(); - if (debug_print) { + AddND(w_correction, v_coarse, queue_idx); + if (multidim_refactoring_debug_print) { PrintSubarray("after add", v); } } // end of loop - if (debug_print) { + if (multidim_refactoring_debug_print) { PrintSubarray("output of decomposition", v); } } - if (D > 3) { + if constexpr (D > 3) { Array workspace2(workspace_shape); SubArray b(workspace2); SubArray b_fine = b; for (int l = 0; l < l_target; ++l) { - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D("before coeff", v); } - // std::vector shape(hierarchy.D_padded); - // for (DIM d = 0; d < hierarchy.D_padded; d++) shape[d] = - // hierarchy.shapes_h[l][d]; - - // gpuErrchk(cudaDeviceSynchronize()); - v_fine.resize(hierarchy.shapes2[l]); w_fine.resize(hierarchy.shapes2[l]); - LwpkReo().Execute(v_fine, w_fine, queue_idx); + CopyND(v_fine, w_fine, queue_idx); v_fine.resize(hierarchy.shapes2[l]); b_fine.resize(hierarchy.shapes2[l]); - LwpkReo().Execute(v_fine, b_fine, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); + CopyND(v_fine, b_fine, queue_idx); + v_coeff.resize(hierarchy.shapes2[l]); - calc_coefficients_nd(hierarchy, w_fine, b_fine, v_coeff, l, queue_idx); + CalcCoefficientsND(hierarchy, w_fine, b_fine, v_coeff, l, queue_idx); - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D(format("after coeff[%d]", l), v_coeff); } // debug - // gpuErrchk(cudaDeviceSynchronize()); w_correction.resize(hierarchy.shapes2[l]); - calc_correction_nd(hierarchy, v_coeff, w_correction, l, 0); - // gpuErrchk(cudaDeviceSynchronize()); + CalcCorrectionND(hierarchy, v_coeff, w_correction, l, queue_idx); w_correction.resize(hierarchy.shapes2[l + 1]); v_coarse.resize(hierarchy.shapes2[l + 1]); - LwpkReo().Execute(w_correction, v_coarse, - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { // debug + AddND(w_correction, v_coarse, queue_idx); + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D(format("after apply correction[%d]", l), v); } // debug } } + DeviceRuntime::SyncDevice(); } template @@ -1809,8 +126,8 @@ void recompose(Hierarchy &hierarchy, SubArray w_correction = w; SubArray v_coarse = v; - if (D <= 3) { - if (debug_print) { + if constexpr (D <= 3) { + if (multidim_refactoring_debug_print) { PrintSubarray("input of recomposition", v); } std::string prefix = "recomp_"; @@ -1823,36 +140,32 @@ void recompose(Hierarchy &hierarchy, // std::cout << prefix << std::endl; for (int l = l_target - 1; l >= 0; l--) { - v_coeff.resize(hierarchy.shapes2[l]); w_correction.resize(hierarchy.shapes2[l]); - calc_correction_3d(hierarchy, v_coeff, w_correction, l, 0); + CalcCorrection3D(hierarchy, v_coeff, w_correction, l, queue_idx); w_correction.resize(hierarchy.shapes2[l + 1]); v_coarse.resize(hierarchy.shapes2[l + 1]); - LwpkReo().Execute(w_correction, v_coarse, - queue_idx); + SubtractND(w_correction, v_coarse, queue_idx); v_coeff.resize(hierarchy.shapes2[l]); w_fine.resize(hierarchy.shapes2[l]); - coefficients_restore_3d(hierarchy, v_coeff, w_fine, l, 0); + CoefficientsRestore3D(hierarchy, v_coeff, w_fine, l, queue_idx); v_fine.resize(hierarchy.shapes2[l]); - LwpkReo().Execute(w_fine, v_fine, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); - - if (debug_print) { + CopyND(w_fine, v_fine, queue_idx); + if 
(multidim_refactoring_debug_print) { PrintSubarray("output of recomposition", v); } } } - if (D > 3) { + if constexpr (D > 3) { Array workspace2(workspace_shape); SubArray b(workspace2); SubArray b_fine = b; for (int l = l_target - 1; l >= 0; l--) { - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D(format("before correction[%d]", l), v); } @@ -1860,50 +173,39 @@ void recompose(Hierarchy &hierarchy, int lddv1, lddv2; int lddw1, lddw2; int lddb1, lddb2; - // un-apply correction - // std::vector shape(hierarchy.D_padded); - // for (DIM d = 0; d < hierarchy.D_padded; d++) shape[d] = - // hierarchy.shapes_h[l][d]; - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D(format("before subtract correction[%d]", l), v); } // debug - // gpuErrchk(cudaDeviceSynchronize()); v_coeff.resize(hierarchy.shapes2[l]); w_correction.resize(hierarchy.shapes2[l]); - calc_correction_nd(hierarchy, v_coeff, w_correction, l, 0); + CalcCorrectionND(hierarchy, v_coeff, w_correction, l, queue_idx); w_correction.resize(hierarchy.shapes2[l + 1]); v_coarse.resize(hierarchy.shapes2[l + 1]); - // gpuErrchk(cudaDeviceSynchronize()); - LwpkReo().Execute(w_correction, v_coarse, - queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); + SubtractND(w_correction, v_coarse, queue_idx); - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug PrintSubarray4D(format("after subtract correction[%d]", l), v); } // debug v_coeff.resize(hierarchy.shapes2[l]); w_fine.resize(hierarchy.shapes2[l]); b_fine.resize(hierarchy.shapes2[l]); - // gpuErrchk(cudaDeviceSynchronize()); - LwpkReo().Execute(v_coeff, b_fine, queue_idx); - LwpkReo().Execute(v_coeff, w_fine, queue_idx); - // gpuErrchk(cudaDeviceSynchronize()); + CopyND(v_coeff, b_fine, queue_idx); + CopyND(v_coeff, w_fine, queue_idx); v_fine.resize(hierarchy.shapes2[l]); - coefficients_restore_nd(hierarchy, w_fine, b_fine, v_fine, l, queue_idx); + CoefficientsRestoreND(hierarchy, w_fine, b_fine, v_fine, l, queue_idx); } // loop levels - if (debug_print) { // debug + if (multidim_refactoring_debug_print) { // debug std::vector shape(hierarchy.D_padded); - // for (DIM d = 0; d < hierarchy.D_padded; d++) shape[d] = - // hierarchy.shapes_h[0][d]; PrintSubarray4D(format("final output"), v); } // debug } // D > 3 + DeviceRuntime::SyncDevice(); } } // namespace mgard_x \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.hpp new file mode 100644 index 0000000000..87a64199b2 --- /dev/null +++ b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.hpp @@ -0,0 +1,32 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "CoefficientKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS +#define MGARD_X_DATA_REFACTORING_CALC_COEFFICIENTS + +namespace mgard_x { + +template +void CalcCoefficients(DIM current_dim, SubArray<1, T, DeviceType> ratio, + SubArray v, + SubArray coarse, + SubArray coeff, int queue_idx) { + + SingleDimensionCoefficient().Execute( + current_dim, ratio, v, coarse, coeff, queue_idx); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientKernel.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientKernel.hpp index c2e5355e06..16245fdf3a 100644 --- a/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientKernel.hpp +++ b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientKernel.hpp @@ -195,13 +195,12 @@ class SingleDimensionCoefficient : public AutoTuner { SubArray v, SubArray coarse, SubArray coeff, int queue_idx) { int range_l = std::min(6, (int)std::log2(coeff.getShape(0)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); int config = AutoTuner::autoTuningTable.gpk_reo_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define GPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -214,22 +213,27 @@ class SingleDimensionCoefficient : public AutoTuner { TaskType task = \ GenTask(current_dim, ratio, v, coarse, coeff, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - GPK(0) - GPK(1) - GPK(2) - GPK(3) - GPK(4) - GPK(5) - GPK(6) + GPK(6) if (!ret.success) config--; + GPK(5) if (!ret.success) config--; + GPK(4) if (!ret.success) config--; + GPK(3) if (!ret.success) config--; + GPK(2) if (!ret.success) config--; + GPK(1) if (!ret.success) config--; + GPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err + << "no suitable config for SingleDimensionCoefficient.\n"; + exit(-1); + } #undef GPK if (AutoTuner::ProfileKernels) { diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.hpp new file mode 100644 index 0000000000..c2c6c02517 --- /dev/null +++ b/include/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.hpp @@ -0,0 +1,32 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "CoefficientKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE +#define MGARD_X_DATA_REFACTORING_COEFFICIENTS_RESTORE + +namespace mgard_x { + +template +void CoefficientsRestore(DIM current_dim, SubArray<1, T, DeviceType> ratio, + SubArray v, + SubArray coarse, + SubArray coeff, int queue_idx) { + + SingleDimensionCoefficient().Execute( + current_dim, ratio, v, coarse, coeff, queue_idx); +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.hpp new file mode 100644 index 0000000000..ea95a4f0b9 --- /dev/null +++ b/include/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.hpp @@ -0,0 +1,84 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "../../../Hierarchy/Hierarchy.hpp" +#include "../../../RuntimeX/RuntimeX.h" + +#include "../DataRefactoring.h" + +#include "../../MultiDimension/Correction/IterativeProcessingKernel.hpp" +#include "../../MultiDimension/Correction/LevelwiseProcessingKernel.hpp" + +#include "MassTransKernel.hpp" + +#ifndef MGARD_X_DATA_REFACTORING_CALC_CORRECTION +#define MGARD_X_DATA_REFACTORING_CALC_CORRECTION + +namespace mgard_x { + +template +void CalcCorrection(Hierarchy &hierarchy, + SubArray &coeff, + SubArray &correction, SIZE curr_dim, + SIZE l, int queue_idx) { + + SingleDimensionMassTrans().Execute( + curr_dim, SubArray(hierarchy.dist_array[curr_dim][l]), + SubArray(hierarchy.ratio_array[curr_dim][l]), coeff, correction, + queue_idx); + + if (singledim_refactoring_debug_print) { + PrintSubarray("SingleDimensionMassTrans", correction); + } + + DIM curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + if (curr_dim == 0) { + correction.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk1Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_f][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_f][l + 1]), correction, queue_idx); + if (singledim_refactoring_debug_print) { + PrintSubarray("Ipk1Reo", correction); + } + + } else if (curr_dim == 1) { + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + correction.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk2Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_c][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_c][l + 1]), correction, queue_idx); + if (singledim_refactoring_debug_print) { + PrintSubarray("Ipk2Reo", correction); + } + } else if (curr_dim == 2) { + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; + correction.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk3Reo().Execute( + curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_r][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), correction, queue_idx); + if (singledim_refactoring_debug_print) { + PrintSubarray("Ipk3Reo", correction); + } + } else { + curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = curr_dim; + correction.project(curr_dim_f, curr_dim_c, curr_dim_r); + Ipk3Reo().Execute( + 
curr_dim_r, curr_dim_c, curr_dim_f, + SubArray(hierarchy.am_array[curr_dim_r][l + 1]), + SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), correction, queue_idx); + if (singledim_refactoring_debug_print) { + PrintSubarray("Ipk3Reo", correction); + } + } +} + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Correction/MassTransKernel.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Correction/MassTransKernel.hpp index cb434ca5a3..079dceae1d 100644 --- a/include/mgard-x/DataRefactoring/SingleDimension/Correction/MassTransKernel.hpp +++ b/include/mgard-x/DataRefactoring/SingleDimension/Correction/MassTransKernel.hpp @@ -167,13 +167,12 @@ class SingleDimensionMassTrans : public AutoTuner { SubArray coeff, SubArray v, int queue_idx) { int range_l = std::min(6, (int)std::log2(v.getShape(0)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); int prec = TypeToIdx(); int config = AutoTuner::autoTuningTable.gpk_reo_nd[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define GPK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -186,22 +185,27 @@ class SingleDimensionMassTrans : public AutoTuner { TaskType task = \ GenTask(current_dim, dist, ratio, coeff, v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - GPK(0) - GPK(1) - GPK(2) - GPK(3) - GPK(4) - GPK(5) - GPK(6) + GPK(6) if (!ret.success) config--; + GPK(5) if (!ret.success) config--; + GPK(4) if (!ret.success) config--; + GPK(3) if (!ret.success) config--; + GPK(2) if (!ret.success) config--; + GPK(1) if (!ret.success) config--; + GPK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err + << "no suitable config for SingleDimensionMassTrans.\n"; + exit(-1); + } #undef GPK if (AutoTuner::ProfileKernels) { diff --git a/include/mgard-x/DataRefactoring/SingleDimension/Correction/SolveTridiagKernel.hpp b/include/mgard-x/DataRefactoring/SingleDimension/Correction/SolveTridiagKernel.hpp deleted file mode 100644 index 7ece383ad7..0000000000 --- a/include/mgard-x/DataRefactoring/SingleDimension/Correction/SolveTridiagKernel.hpp +++ /dev/null @@ -1,131 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_SINGLE_DIMENSION_SOLVE_TRIDIAG_KERNEL_TEMPLATE -#define MGARD_X_SINGLE_DIMENSION_SOLVE_TRIDIAG_KERNEL_TEMPLATE - -#include "../../../RuntimeX/RuntimeX.h" - -// #include "../../MultiDimension/Correction/LPKFunctor.h" - -namespace mgard_x { - -template -class ForwardPassMultCoefficientFunctor : public Functor { -public: - MGARDX_CONT ForwardPassMultCoefficientFunctor() {} - MGARDX_CONT - ForwardPassMultCoefficientFunctor(SubArray<1, T, DeviceType> am, - SubArray<1, T, DeviceType> bm, - SubArray<1, T, DeviceType> amXbm) - : am(am), bm(bm), (amXbm) { - Functor(); - } - - MGARDX_EXEC void Operation1() { - SIZE id = FunctorBase::GetBlockIdX() * - FunctorBase::GetBlockDimX() + - FunctorBase::GetThreadIdX(); - - if (id < am.getShape(0)) { - *amXbm(id) = (*am(id)) * (*bm(id)); - } - } - - MGARDX_CONT size_t shared_memory_size() { return 0; } - -private: - // functor parameters - SubArray<1, T, DeviceType> am; - SubArray<1, T, DeviceType> bm; - SubArray<1, T, DeviceType> amXbm; -}; - -template -class SingleDimensionSolveTridiag : public AutoTuner { -public: - MGARDX_CONT - SingleDimensionSolveTridiag() : AutoTuner() {} - - template - MGARDX_CONT Task> - GenTask(SubArray<1, T, DeviceType> am, SubArray<1, T, DeviceType> bm, - SubArray<1, T, DeviceType> amXbm, int queue_idx) { - - using FunctorType = ForwardPassMultCoefficientFunctor; - FunctorType functor(am, bm, amXbm); - - SIZE nf = v.getShape(0); - SIZE total_thread_x = nf; - - SIZE tbx, tby, tbz, gridx, gridy, gridz; - size_t sm_size = functor.shared_memory_size(); - - tbz = 1; - tby = 1; - tbx = F; - gridz = 1; - gridy = 1; - gridx = ceil((float)total_thread_x / tbx); - - return Task(functor, gridz, gridy, gridx, tbz, tby, tbx, sm_size, queue_idx, - "ForwardPassMultCoefficient"); - } - - MGARDX_CONT - void Execute(DIM current_dim, SubArray<1, T, DeviceType> dist, - SubArray<1, T, DeviceType> ratio, - SubArray coeff, SubArray v, - SubArray<1, T, DeviceType> am, SubArray<1, T, DeviceType> bm, - int queue_idx) { - - Array<1, T, DeviceType> amXbm(am.getShape(0)); - - int range_l = std::min(6, (int)std::log2(coeff.getShape(0)) - 1); - int arch = DeviceRuntime::GetArchitectureGeneration(); - int prec = TypeToIdx(); - int config = - AutoTuner::autoTuningTable.gpk_reo_nd[prec][range_l]; - - double min_time = std::numeric_limits::max(); - int min_config = 0; - -#define GPK(CONFIG) \ - if (config == CONFIG || AutoTuner::ProfileKernels) { \ - const int F = GPK_CONFIG[D - 1][CONFIG][2]; \ - using FunctorType = ForwardPassMultCoefficient; \ - using TaskType = Task; \ - TaskType task = GenTask(am, bm, SubArray(amXbm), queue_idx); \ - DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ - if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ - min_time = ret.execution_time; \ - min_config = CONFIG; \ - } \ - } \ - } - - GPK(0) - GPK(1) - GPK(2) - GPK(3) - GPK(4) - GPK(5) - GPK(6) -#undef GPK - - if (AutoTuner::ProfileKernels) { - FillAutoTunerTable("SingleDimensionSolveTridiag", prec, - range_l, min_config); - } - } -}; - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.h b/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.h index 79178e71bc..f68b0b9a02 100644 --- a/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.h +++ 
b/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.h @@ -8,11 +8,45 @@ #ifndef MGARD_X_SINGLE_DIMENSION_DATA_REFACTORING #define MGARD_X_SINGLE_DIMENSION_DATA_REFACTORING -#include "../../Hierarchy.h" +#include "../../Hierarchy/Hierarchy.h" #include "../../RuntimeX/RuntimeXPublic.h" namespace mgard_x { +static bool singledim_refactoring_store = false; +static bool singledim_refactoring_verify = false; +static bool singledim_refactoring_debug_print = false; + +template +void CalcCoefficients(DIM current_dim, SubArray<1, T, DeviceType> ratio, + SubArray v, + SubArray coarse, + SubArray coeff, int queue_idx); + +template +void CoefficientsRestore(DIM current_dim, SubArray<1, T, DeviceType> ratio, + SubArray v, + SubArray coarse, + SubArray coeff, int queue_idx); + +template +void CalcCorrection(Hierarchy &hierarchy, + SubArray &coeff, + SubArray &correction, SIZE curr_dim, + SIZE l, int queue_idx); + +template +void CopyND(SubArray dinput, + SubArray &doutput, int queue_idx); + +template +void AddND(SubArray dinput, + SubArray &doutput, int queue_idx); + +template +void SubtractND(SubArray dinput, + SubArray &doutput, int queue_idx); + template void decompose_single(Hierarchy &hierarchy, SubArray &v, SIZE l_target, diff --git a/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp b/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp index c86e515178..86ade65620 100644 --- a/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp +++ b/include/mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp @@ -5,75 +5,13 @@ * Date: March 17, 2022 */ -#include "../../Hierarchy.hpp" +#include "../../Hierarchy/Hierarchy.hpp" #include "../../RuntimeX/RuntimeX.h" -#include "../MultiDimension/Correction/IterativeProcessingKernel.hpp" -#include "../MultiDimension/Correction/LevelwiseProcessingKernel.hpp" -#include "Coefficient/CoefficientKernel.hpp" -#include "Correction/MassTransKernel.hpp" +#include "DataRefactoring.h" namespace mgard_x { -template -void calc_correction_single(Hierarchy &hierarchy, - SubArray &coeff, - SubArray &correction, - SIZE curr_dim, SIZE l, int queue_idx) { - - SingleDimensionMassTrans().Execute( - curr_dim, SubArray(hierarchy.dist_array[curr_dim][l]), - SubArray(hierarchy.ratio_array[curr_dim][l]), coeff, correction, - queue_idx); - - if (debug_print) { - PrintSubarray("SingleDimensionMassTrans", correction); - } - - DIM curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - if (curr_dim == 0) { - correction.project(curr_dim_f, curr_dim_c, curr_dim_r); - Ipk1Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_f][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_f][l + 1]), correction, queue_idx); - if (debug_print) { - PrintSubarray("Ipk1Reo", correction); - } - - } else if (curr_dim == 1) { - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - correction.project(curr_dim_f, curr_dim_c, curr_dim_r); - Ipk2Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_c][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_c][l + 1]), correction, queue_idx); - if (debug_print) { - PrintSubarray("Ipk2Reo", correction); - } - } else if (curr_dim == 2) { - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = 2; - correction.project(curr_dim_f, curr_dim_c, curr_dim_r); - Ipk3Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_r][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), correction, queue_idx); - if (debug_print) { - 
PrintSubarray("Ipk3Reo", correction); - } - } else { - curr_dim_f = 0, curr_dim_c = 1, curr_dim_r = curr_dim; - correction.project(curr_dim_f, curr_dim_c, curr_dim_r); - Ipk3Reo().Execute( - curr_dim_r, curr_dim_c, curr_dim_f, - SubArray(hierarchy.am_array[curr_dim_r][l + 1]), - SubArray(hierarchy.bm_array[curr_dim_r][l + 1]), correction, queue_idx); - if (debug_print) { - PrintSubarray("Ipk3Reo", correction); - } - } -} - template void decompose_single(Hierarchy &hierarchy, SubArray &v, SIZE l_target, @@ -85,13 +23,13 @@ void decompose_single(Hierarchy &hierarchy, Array workspace(workspace_shape); SubArray w(workspace); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("Input", v); } for (int l = 0; l < l_target; ++l) { for (int curr_dim = 0; curr_dim < D; curr_dim++) { - if (debug_print) { + if (singledim_refactoring_debug_print) { std::cout << "l: " << l << " curr_dim: " << curr_dim << "\n"; } std::vector fine_shape(D); @@ -133,28 +71,26 @@ void decompose_single(Hierarchy &hierarchy, SubArray correction = w; correction.resize(coarse_shape); - LwpkReo().Execute(v_fine, w_fine, queue_idx); + CopyND(v_fine, w_fine, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("COPY", w_fine); } - SingleDimensionCoefficient().Execute( - curr_dim, SubArray(hierarchy.ratio_array[curr_dim][l]), w_fine, - coarse, coeff, queue_idx); + CalcCoefficients(curr_dim, SubArray(hierarchy.ratio_array[curr_dim][l]), + w_fine, coarse, coeff, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("SingleDimensionCoefficient - fine", w_fine); PrintSubarray("SingleDimensionCoefficient - corase", coarse); PrintSubarray("SingleDimensionCoefficient - coeff", coeff); } - calc_correction_single(hierarchy, coeff, correction, curr_dim, l, - queue_idx); + CalcCorrection(hierarchy, coeff, correction, curr_dim, l, queue_idx); - LwpkReo().Execute(correction, coarse, queue_idx); + AddND(correction, coarse, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("ADD", coarse); } @@ -173,13 +109,13 @@ void recompose_single(Hierarchy &hierarchy, Array workspace(workspace_shape); SubArray w(workspace); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("Input", v); } for (int l = l_target - 1; l >= 0; --l) { for (int curr_dim = D - 1; curr_dim >= 0; curr_dim--) { - if (debug_print) { + if (singledim_refactoring_debug_print) { std::cout << "l: " << l << " curr_dim: " << curr_dim << "\n"; } std::vector fine_shape(D); @@ -220,29 +156,27 @@ void recompose_single(Hierarchy &hierarchy, SubArray correction = w; correction.resize(coarse_shape); - calc_correction_single(hierarchy, coeff, correction, curr_dim, l, - queue_idx); + CalcCorrection(hierarchy, coeff, correction, curr_dim, l, queue_idx); - LwpkReo().Execute(correction, coarse, - queue_idx); + SubtractND(correction, coarse, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("SUBTRACT", coarse); } - SingleDimensionCoefficient().Execute( - curr_dim, SubArray(hierarchy.ratio_array[curr_dim][l]), w_fine, - coarse, coeff, queue_idx); + CoefficientsRestore(curr_dim, + SubArray(hierarchy.ratio_array[curr_dim][l]), w_fine, + coarse, coeff, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("SingleDimensionCoefficient - fine", w_fine); PrintSubarray("SingleDimensionCoefficient - corase", coarse); PrintSubarray("SingleDimensionCoefficient - coeff", coeff); } 
- LwpkReo().Execute(w_fine, v_fine, queue_idx); + CopyND(w_fine, v_fine, queue_idx); - if (debug_print) { + if (singledim_refactoring_debug_print) { PrintSubarray("COPY", v_fine); } diff --git a/include/mgard-x/Hierarchy/CMakeLists.txt b/include/mgard-x/Hierarchy/CMakeLists.txt new file mode 100644 index 0000000000..d02867c67b --- /dev/null +++ b/include/mgard-x/Hierarchy/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND MGARD_X_HEADER + ${CMAKE_CURRENT_SOURCE_DIR}/Hierarchy.h + ${CMAKE_CURRENT_SOURCE_DIR}/Hierarchy.hpp + ) +set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/Hierarchy.h b/include/mgard-x/Hierarchy/Hierarchy.h similarity index 97% rename from include/mgard-x/Hierarchy.h rename to include/mgard-x/Hierarchy/Hierarchy.h index 2553d38f0e..d4a3af8071 100644 --- a/include/mgard-x/Hierarchy.h +++ b/include/mgard-x/Hierarchy/Hierarchy.h @@ -8,8 +8,8 @@ #ifndef MGARD_X_HANDLE #define MGARD_X_HANDLE -#include "RuntimeX/RuntimeXPublic.h" -#include "Types.h" +#include "../RuntimeX/RuntimeXPublic.h" +#include "../Utilities/Types.h" namespace mgard_x { @@ -32,7 +32,7 @@ struct Config { int reorder; Config() { - dev_type = device_type::Auto; + dev_type = device_type::AUTO; dev_id = 0; decomposition = decomposition_type::MultiDim; l_target = -1; // no limit diff --git a/include/mgard-x/Hierarchy.hpp b/include/mgard-x/Hierarchy/Hierarchy.hpp similarity index 99% rename from include/mgard-x/Hierarchy.hpp rename to include/mgard-x/Hierarchy/Hierarchy.hpp index db28f7cb64..0e5635b4de 100644 --- a/include/mgard-x/Hierarchy.hpp +++ b/include/mgard-x/Hierarchy/Hierarchy.hpp @@ -5,8 +5,8 @@ * Date: March 17, 2022 */ +#include "../RuntimeX/RuntimeX.h" #include "Hierarchy.h" -#include "RuntimeX/RuntimeX.h" #include #include diff --git a/include/mgard-x/Linearization/CMakeLists.txt b/include/mgard-x/Linearization/CMakeLists.txt new file mode 100644 index 0000000000..0e6faba3c8 --- /dev/null +++ b/include/mgard-x/Linearization/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND MGARD_X_HEADER + ${CMAKE_CURRENT_SOURCE_DIR}/LevelLinearizer.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/LevelLinearizer2.hpp + ) +set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/Linearization/LevelLinearizer2.hpp b/include/mgard-x/Linearization/LevelLinearizer2.hpp index a8a7498250..7a77e431f4 100644 --- a/include/mgard-x/Linearization/LevelLinearizer2.hpp +++ b/include/mgard-x/Linearization/LevelLinearizer2.hpp @@ -316,8 +316,8 @@ class LevelLinearizer2 : public AutoTuner { for (int d = 3; d < D; d++) { gridx *= shape.dataHost()[d]; } - return Task(functor, gridz, gridy, gridx, tbz, tby, tbx, sm_size, - queue_idx); + return Task(functor, gridz, gridy, gridx, tbz, tby, tbx, sm_size, queue_idx, + "LevelLinearizer"); } MGARDX_CONT @@ -349,10 +349,10 @@ class LevelLinearizer2 : public AutoTuner { int range_l = std::min(6, (int)std::log2(v.getShape(0)) - 1); int prec = TypeToIdx(); - int config = AutoTuner::autoTuningTable.llk[prec][range_l]; double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; #define LLK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -365,21 +365,25 @@ class LevelLinearizer2 : public AutoTuner { TaskType task = \ GenTask(shape, l_target, ranges, v, d_level_v, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if 
(ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LLK(0) - LLK(1) - LLK(2) - LLK(3) - LLK(4) - LLK(5) - LLK(6) + LLK(6) if (!ret.success) config--; + LLK(5) if (!ret.success) config--; + LLK(4) if (!ret.success) config--; + LLK(3) if (!ret.success) config--; + LLK(2) if (!ret.success) config--; + LLK(1) if (!ret.success) config--; + LLK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err << "no suitable config for LevelLinearizer.\n"; + exit(-1); + } #undef LLK if (AutoTuner::ProfileKernels) { FillAutoTunerTable("llk", prec, range_l, min_config); diff --git a/include/mgard-x/Lossless/ParallelHuffman/GenerateCL.hpp b/include/mgard-x/Lossless/ParallelHuffman/GenerateCL.hpp index 48ab3ba211..65bd36c41c 100644 --- a/include/mgard-x/Lossless/ParallelHuffman/GenerateCL.hpp +++ b/include/mgard-x/Lossless/ParallelHuffman/GenerateCL.hpp @@ -90,7 +90,6 @@ class GenerateCLFunctor : public HuffmanCLCustomizedFunctor { // printf("LoopCondition1 %d %u %d\n", (*status((IDX)_lNodesCur)), size, // (*status((IDX)_iNodesSize))); return (*status((IDX)_lNodesCur)) < size || (*status((IDX)_iNodesSize)) > 1; - ; } MGARDX_EXEC void Operation2() { @@ -239,7 +238,8 @@ class GenerateCLFunctor : public HuffmanCLCustomizedFunctor { if (*lNodesFreq((IDX)i) <= (*status((IDX)_minFreq))) { threadCurLeavesNum = i - (*status((IDX)_lNodesCur)) + 1; // Atomic max -- Largest valid index - Atomic::Max(status((IDX)_curLeavesNum), threadCurLeavesNum); + Atomic::Max( + status((IDX)_curLeavesNum), threadCurLeavesNum); } if (i - (*status((IDX)_lNodesCur)) < (*status((IDX)_curLeavesNum))) { @@ -725,16 +725,18 @@ class GenerateCL : public AutoTuner { SubArray<1, int, DeviceType> copyIndex, SubArray<1, uint32_t, DeviceType> diagonal_path_intersections, int queue_idx) { - Array<1, int, DeviceType> status({(SIZE)16}, false, true); + Array<1, int, DeviceType> status_array({(SIZE)16}, false, true); + SubArray status(status_array); using FunctorType = GenerateCLFunctor; using TaskType = Task; - TaskType task = GenTask( - histogram, CL, dict_size, lNodesFreq, lNodesLeader, iNodesFreq, - iNodesLeader, tempFreq, tempIsLeaf, tempIndex, copyFreq, copyIsLeaf, - copyIndex, diagonal_path_intersections, SubArray(status), queue_idx); + TaskType task = GenTask(histogram, CL, dict_size, lNodesFreq, lNodesLeader, + iNodesFreq, iNodesLeader, tempFreq, tempIsLeaf, + tempIndex, copyFreq, copyIsLeaf, copyIndex, + diagonal_path_intersections, status, queue_idx); DeviceAdapter adapter; adapter.Execute(task); + DeviceRuntime::SyncAllQueues(); } }; diff --git a/include/mgard-x/Lossless/ParallelHuffman/GenerateCW.hpp b/include/mgard-x/Lossless/ParallelHuffman/GenerateCW.hpp index 3698eb0eb9..57d2906dca 100644 --- a/include/mgard-x/Lossless/ParallelHuffman/GenerateCW.hpp +++ b/include/mgard-x/Lossless/ParallelHuffman/GenerateCW.hpp @@ -95,7 +95,8 @@ class GenerateCWFunctor : public HuffmanCWCustomizedFunctor { FunctorBase::GetThreadIdX(); // (*status((IDX)_CDPI)) update if (i < size - 1 && *CL((IDX)i + 1) > (*status((IDX)_CCL))) { - Atomic::Min(&(*status((IDX)_newCDPI)), (int)i); + Atomic::Min( + &(*status((IDX)_newCDPI)), (int)i); } } @@ -313,6 +314,7 @@ class GenerateCW : public AutoTuner { GenTask(CL, CW, first, entry, dict_size, SubArray(status), queue_idx); DeviceAdapter adapter; adapter.Execute(task); + DeviceRuntime::SyncAllQueues(); } }; diff --git a/include/mgard-x/Lossless/ParallelHuffman/GetFirstNonzeroIndex.hpp 
b/include/mgard-x/Lossless/ParallelHuffman/GetFirstNonzeroIndex.hpp index e45f6cf215..039e867d2d 100644 --- a/include/mgard-x/Lossless/ParallelHuffman/GetFirstNonzeroIndex.hpp +++ b/include/mgard-x/Lossless/ParallelHuffman/GetFirstNonzeroIndex.hpp @@ -28,7 +28,8 @@ class GetFirstNonzeroIndexFunctor : public Functor { FunctorBase::GetBlockDimX()) + FunctorBase::GetThreadIdX(); if (thread < size && *array(thread) != 0) { - Atomic::Min(result((IDX)0), thread); + Atomic::Min(result((IDX)0), thread); } } diff --git a/include/mgard-x/Lossless/ParallelHuffman/Histogram.hpp b/include/mgard-x/Lossless/ParallelHuffman/Histogram.hpp index 773a422744..7786ebd388 100644 --- a/include/mgard-x/Lossless/ParallelHuffman/Histogram.hpp +++ b/include/mgard-x/Lossless/ParallelHuffman/Histogram.hpp @@ -46,15 +46,24 @@ class HistogramFunctor : public Functor { if (warpid >= warps_block - 1) end = N; - for (unsigned int pos = FunctorBase::GetThreadIdX(); - pos < (bins)*R; pos += FunctorBase::GetBlockDimX()) - Hs[pos] = 0; + if (CACHE_HISTOGRAM) { + for (unsigned int pos = FunctorBase::GetThreadIdX(); + pos < (bins)*R; pos += FunctorBase::GetBlockDimX()) { + Hs[pos] = 0; + } + } } MGARDX_EXEC void Operation2() { for (unsigned int i = begin; i < end; i += step) { int d = *input_data(i); - Atomic::Add(&Hs[off_rep + d], 1); + if (CACHE_HISTOGRAM) { + Atomic::Add( + &Hs[off_rep + d], 1); + } else { + Atomic::Add( + &Hs[off_rep + d], 1); + } } } @@ -65,7 +74,8 @@ class HistogramFunctor : public Functor { for (int base = 0; base < (bins)*R; base += bins) { sum += Hs[base + pos]; } - Atomic::Add(output(pos), (Q)sum); + Atomic::Add( + output(pos), (Q)sum); } } @@ -120,9 +130,10 @@ class Histogram : public AutoTuner { threadsPerBlock = ((((numValues / (numBlocks * itemsPerThread)) + 1) / 64) + 1) * 64; - while (threadsPerBlock > 1024) { + while (threadsPerBlock > + DeviceRuntime::GetMaxNumThreadsPerTB()) { if (RPerBlock <= 1) { - threadsPerBlock = 1024; + threadsPerBlock = DeviceRuntime::GetMaxNumThreadsPerTB(); } else { RPerBlock /= 2; numBlocks *= 2; @@ -218,13 +229,16 @@ class Histogram : public AutoTuner { int threadsPerBlock, numBlocks; Config(len, dict_size, RPerBlock, threadsPerBlock, numBlocks); Array<1, int, DeviceType> local_histogram_array( - {(SIZE)2 * dict_size * numBlocks}); + {(SIZE)RPerBlock * dict_size * numBlocks}, false, true); + local_histogram_array.memset(0); + DeviceRuntime::SyncAllQueues(); local_histogram = SubArray(local_histogram_array); TaskType task = GenTask(input_data, local_histogram, output, len, dict_size, RPerBlock, threadsPerBlock, numBlocks, queue_idx); DeviceAdapter adapter; adapter.Execute(task); + DeviceRuntime::SyncAllQueues(); } } }; diff --git a/include/mgard-x/Quantization/LinearQuantization.hpp b/include/mgard-x/Quantization/LinearQuantization.hpp index 9b66962246..da6ce9d15c 100644 --- a/include/mgard-x/Quantization/LinearQuantization.hpp +++ b/include/mgard-x/Quantization/LinearQuantization.hpp @@ -322,7 +322,8 @@ class LevelwiseLinearQuantizeNDFunctor : public Functor { if (quantized_data >= 0 && quantized_data < dict_size) { // do nothing } else { - LENGTH i = Atomic::Add(outlier_count((IDX)0), (LENGTH)1); + LENGTH i = Atomic::Add(outlier_count((IDX)0), (LENGTH)1); *outlier_idx(i) = get_idx(shape_sm, idx); *outliers(i) = quantized_data; quantized_data = 0; @@ -445,6 +446,7 @@ class LevelwiseLinearQuantizeND : public AutoTuner { int config = AutoTuner::autoTuningTable.lwqzk[prec][range_l]; double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn 
ret; #define LWQZK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ @@ -458,21 +460,26 @@ class LevelwiseLinearQuantizeND : public AutoTuner { ranges, l_target, quantizers, volumes, s, huff_dict_size, v, work, \ prep_huffman, shape, outlier_count, outlier_idx, outliers, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LWQZK(0) - LWQZK(1) - LWQZK(2) - LWQZK(3) - LWQZK(4) - LWQZK(5) - LWQZK(6) + LWQZK(6) if (!ret.success) config--; + LWQZK(5) if (!ret.success) config--; + LWQZK(4) if (!ret.success) config--; + LWQZK(3) if (!ret.success) config--; + LWQZK(2) if (!ret.success) config--; + LWQZK(1) if (!ret.success) config--; + LWQZK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err + << "no suitable config for LevelwiseLinearQuantizeND.\n"; + exit(-1); + } #undef LWQZK if (AutoTuner::ProfileKernels) { FillAutoTunerTable("lwqzk", prec, range_l, min_config); @@ -963,11 +970,11 @@ class LevelwiseLinearDequantizeND : public AutoTuner { int range_l = std::min(6, (int)std::log2(v.getShape(0)) - 1); int prec = TypeToIdx(); - int config = AutoTuner::autoTuningTable.lwdqzk[prec][range_l]; - double min_time = std::numeric_limits::max(); int min_config = 0; + ExecutionReturn ret; + #define LWDQZK(CONFIG) \ if (config == CONFIG || AutoTuner::ProfileKernels) { \ const int R = LWPK_CONFIG[D - 1][CONFIG][0]; \ @@ -980,21 +987,27 @@ class LevelwiseLinearDequantizeND : public AutoTuner { ranges, l_target, quantizers, volumes, s, huff_dict_size, v, work, \ prep_huffman, shape, outlier_count, outlier_idx, outliers, queue_idx); \ DeviceAdapter adapter; \ - ExecutionReturn ret = adapter.Execute(task); \ + ret = adapter.Execute(task); \ if (AutoTuner::ProfileKernels) { \ - if (min_time > ret.execution_time) { \ + if (ret.success && min_time > ret.execution_time) { \ min_time = ret.execution_time; \ min_config = CONFIG; \ } \ } \ } - LWDQZK(0) - LWDQZK(1) - LWDQZK(2) - LWDQZK(3) - LWDQZK(4) - LWDQZK(5) - LWDQZK(6) + + LWDQZK(6) if (!ret.success) config--; + LWDQZK(5) if (!ret.success) config--; + LWDQZK(4) if (!ret.success) config--; + LWDQZK(3) if (!ret.success) config--; + LWDQZK(2) if (!ret.success) config--; + LWDQZK(1) if (!ret.success) config--; + LWDQZK(0) if (!ret.success) config--; + if (config < 0 && !ret.success) { + std::cout << log::log_err + << "no suitable config for LevelwiseLinearDequantizeND.\n"; + exit(-1); + } #undef LWDQZK if (AutoTuner::ProfileKernels) { FillAutoTunerTable("lwdqzk", prec, range_l, min_config); @@ -1002,783 +1015,6 @@ class LevelwiseLinearDequantizeND : public AutoTuner { } }; -// template -// __global__ void -// _levelwise_linear_quantize(SIZE *shapes, SIZE l_target, T *quantizers, T * -// volumes, SIZE ldvolumes, T *dv, -// SIZE *ldvs, QUANTIZED_INT *dwork, SIZE *ldws, bool -// prep_huffman, SIZE dict_size, SIZE *shape, LENGTH -// *outlier_count, LENGTH *outlier_idx, QUANTIZED_INT -// *outliers) { - -// size_t threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; -// T * smT = SharedMemory(); -// T * quantizers_sm = smT; smT += l_target + 1; - -// T * volumes_0 = smT; if (CALC_VOL) smT += blockDim.x * (l_target + 1); -// T * volumes_1 = smT; if (CALC_VOL) smT += blockDim.y * (l_target + 1); -// T * 
volumes_2 = smT; if (CALC_VOL) smT += blockDim.z * (l_target + 1); -// T * volumes_3_plus = smT; -// if (CALC_VOL && D > 3) smT += (D-3) * (l_target + 1); - -// SIZE * smInt = (SIZE *)smT; -// SIZE *ldvs_sm = smInt; smInt += D; -// SIZE *ldws_sm = smInt; smInt += D; -// SIZE *shape_sm = smInt; smInt += D; -// SIZE *shapes_sm = smInt; smInt += D * (l_target + 2); - -// if (threadId < l_target + 1) { -// quantizers_sm[threadId] = quantizers[threadId]; -// } -// if (threadId < D) { -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// shape_sm[threadId] = shape[threadId]; -// // if (threadId == 0) { -// // printf("%u %u %u %u %u %u\n", shape[0], shape[1], shape[2], ldws[0], -// ldws[1], ldws[2]); -// // } -// } -// if (threadId < D * (l_target + 2)) { -// shapes_sm[threadId] = shapes[threadId]; -// // printf ("D: %d l_target+2: %d load shapes[%llu]: %d\n", D, l_target+2, -// // threadId, shapes_sm[threadId]); -// } - -// __syncthreads(); - -// // determine global idx -// SIZE idx[D]; //thread global idx -// SIZE idx0[D]; //block global idx - -// SIZE firstD = div_roundup(shapes_sm[l_target + 1], F); - -// SIZE bidx = blockIdx.x; -// idx[0] = (bidx % firstD) * F + threadIdx.x; -// idx0[0] = (bidx % firstD) * F; - -// // printf("shapes_sm[l_target+1]: %d firstD %d idx[0] %d\n", -// // shapes_sm[l_target+1], firstD, idx[0]); - -// bidx /= firstD; -// if (D >= 2) { -// idx[1] = blockIdx.y * blockDim.y + threadIdx.y; -// idx0[1] = blockIdx.y * blockDim.y; -// } -// if (D >= 3) { -// idx[2] = blockIdx.z * blockDim.z + threadIdx.z; -// idx0[2] = blockIdx.z * blockDim.z; -// } - -// for (int d = 3; d < D; d++) { -// idx[d] = bidx % shapes_sm[(l_target + 2) * d + l_target + 1]; -// idx0[d] = idx[d]; -// bidx /= shapes_sm[(l_target + 2) * d + l_target + 1]; - -// } - -// if (CALC_VOL) { -// // cache volumes -// for (int l = 0; l < l_target+1; l++) { -// // volumes 0 -// if (threadId < blockDim.x && idx0[0] + threadId < shapes_sm[(l_target + -// 2) * 0 + l_target + 1]) { -// volumes_0[l * blockDim.x + threadId] = -// volumes[(0 * (l_target + 1) + l) * ldvolumes + idx0[0] + threadId]; -// // printf("load %f\n", volumes[(0 * (l_target + 1) + l) * ldvolumes + -// idx0[0] + threadId]); -// } -// if (D >= 2) { -// // volumes 1 -// if (threadId < blockDim.y && idx0[1] + threadId < shapes_sm[(l_target -// + 2) * 1 + l_target + 1]) { -// volumes_1[l * blockDim.y + threadId] = -// volumes[(1 * (l_target + 1) + l) * ldvolumes + idx0[1] + -// threadId]; -// } -// } -// if (D >= 3) { -// // volumes 2 -// if (threadId < blockDim.z && idx0[2] + threadId < shapes_sm[(l_target -// + 2) * 2 + l_target + 1]) { -// volumes_2[l * blockDim.z + threadId] = -// volumes[(2 * (l_target + 1) + l) * ldvolumes + idx0[2] + -// threadId]; -// } -// } -// } - -// if (D >= 4) { -// if (threadId < 1) { -// for (int d = 3; d < D; d++) { -// for (int l = 0; l < l_target+1; l++) { -// volumes_3_plus[(d-3) * (l_target + 1) + l] = -// volumes[(d * (l_target + 1) + l) * ldvolumes + idx[d]]; -// } -// } -// } -// } -// } - -// // if (blockIdx.y == 0 && blockIdx.x == 0 && blockIdx.z == 0 && threadId == -// 0) { -// // printf("volumes_0: "); -// // for (int l = 0; l < l_target+1; l++) { -// // printf("l = %d\n", l); -// // for (int i = 0; i < min(blockDim.x, shapes_sm[(l_target + 2) * 0 + -// l_target + 1]) ; i++) { -// // printf("%f ", volumes_0[l * blockDim.x + i]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // if (D >= 2) { -// // printf("volumes_1: "); -// // for (int l = 0; l < l_target+1; 
l++) { -// // printf("l = %d\n", l); -// // for (int i = 0; i < min(blockDim.y, shapes_sm[(l_target + 2) * 1 + -// l_target + 1]); i++) { -// // printf("%f ", volumes_1[l * blockDim.y + i]); -// // } -// // printf("\n"); -// // } - -// // printf("\n"); -// // } -// // if (D >= 3) { -// // printf("volumes_2: "); -// // for (int l = 0; l < l_target+1; l++) { -// // printf("l = %d\n", l); -// // for (int i = 0; i < min(blockDim.z, shapes_sm[(l_target + 2) * 2 + -// l_target + 1]); i++) { -// // printf("%f ", volumes_2[l * blockDim.y + i]); -// // } -// // printf("\n"); -// // } -// // } -// // } - -// __syncthreads(); - -// int level = 0; -// for (DIM d = 0; d < D; d++) { -// long long unsigned int l_bit = 0l; -// for (SIZE l = 0; l < l_target + 1; l++) { -// int bit = (idx[d] >= shapes_sm[(l_target + 2) * d + l]) && -// (idx[d] < shapes_sm[(l_target + 2) * d + l + 1]); -// l_bit += bit << l; -// // printf("idx: %d %d d: %d l_bit: %llu\n", idx[1], idx[0], d, l_bit); -// } -// level = max(level, __ffsll(l_bit)); -// } -// level = level - 1; - -// bool in_range = true; -// for (DIM d = 0; d < D; d++) { -// if (idx[d] >= shapes_sm[(l_target + 2) * d + l_target + 1]) -// in_range = false; -// } - -// // printf("idx %llu, level: %d, in_range: %d idx[0]: shape_sm: %d\n", -// // get_idx(shape_sm, idx), level, in_range, shapes_sm[(l_target+2) * 0 + -// // l_target+1]); - -// if (level >= 0 && level <= l_target && in_range) { -// T t = dv[get_idx(ldvs, idx)]; -// T volume = 1; -// if (CALC_VOL) { -// volume *= volumes_0[level * blockDim.x + threadIdx.x]; -// if (D >= 2) { -// volume *= volumes_1[level * blockDim.y + threadIdx.y]; -// } -// if (D >= 3) { -// volume *= volumes_2[level * blockDim.z + threadIdx.z]; -// } -// if (D >= 4) { -// for (int d = 3; d < D; d++) { -// volume *= volumes_3_plus[(d-3) * (l_target + 1) + level]; -// } -// } -// if (sizeof(T) == sizeof(double)) volume = sqrt(volume); -// else if (sizeof(T) == sizeof(float)) volume = sqrtf(volume); -// } -// // printf("l: %d, vol %f(%f*%f*%f), quantizers_sm: %f, quantizers: %f, -// before: %f, quantized: %d\n", level, volume, -// // volumes_0[level * blockDim.x + threadIdx.x], volumes_1[level * -// blockDim.y + threadIdx.y], volumes_2[level * blockDim.z + threadIdx.z], -// // quantizers_sm[level], -// // (quantizers_sm[level] / volume), t, (int)copysign(0.5 + fabs(t /( -// quantizers_sm[level] / volume)), t)); - -// QUANTIZED_INT quantized_data = copysign(0.5 + fabs(t / -// (quantizers_sm[level] * volume) ), t); -// // QUANTIZED_INT quantized_data = copysign(0.5 + fabs(t / -// (quantizers_sm[level] / volume) ), t); -// // printf("dv[%llu] %f quantizers[%d]%f -> dw[%llu]%d \n", -// // get_idx(ldvs, idx), t, -// // level, quantizers_sm[level], -// // get_idx(ldws, idx), quantized_data+dict_size / 2); - -// if (prep_huffman) { -// quantized_data += dict_size / 2; -// if (quantized_data >= 0 && quantized_data < dict_size) { -// // do nothing -// } else { -// LENGTH i = atomicAdd(outlier_count, (LENGTH)1); -// outlier_idx[i] = get_idx(shape_sm, idx); -// outliers[i] = quantized_data; -// quantized_data = 0; -// } -// // if (get_idx(shape_sm, idx) < quant_meta_size_ratio) { -// // size_t i = atomicAdd((unsigned long long int*)outlier_count, -// // (unsigned long long int)1); outlier_idx[i] = get_idx(shape_sm, -// // idx); -// // } -// } - -// dwork[get_idx(ldws_sm, idx)] = quantized_data; -// } -// } - -// template -// void levelwise_linear_quantize_adaptive_launcher( -// Handle &handle, SIZE *shapes, SIZE l_target, T *volumes, -// SIZE 
ldvolumes, Metadata &m, T *dv, SIZE *ldvs, QUANTIZED_INT *dwork, -// SIZE *ldws, bool prep_huffman, SIZE *shape, LENGTH *outlier_count, -// LENGTH *outlier_idx, QUANTIZED_INT *outliers, int queue_idx) { - -// T *quantizers = new T[l_target + 1]; -// calc_quantizers(handle, quantizers, m, false); -// cudaMemcpyAsyncHelper(handle, handle.quantizers, quantizers, -// sizeof(T) * (l_target + 1), H2D, queue_idx); - -// // printf("norm: %f, tol: %f, s: %f, dict_size: %d\n", m.norm, m.tol, m.s, -// // m.dict_size); -// int total_thread_z = handle.dofs[2][0]; -// int total_thread_y = handle.dofs[1][0]; -// int total_thread_x = handle.dofs[0][0]; -// // linearize other dimensions -// int tbz = R; -// int tby = C; -// int tbx = F; -// int gridz = ceil((float)total_thread_z / tbz); -// int gridy = ceil((float)total_thread_y / tby); -// int gridx = ceil((float)total_thread_x / tbx); -// for (int d = 3; d < D; d++) { -// gridx *= handle.dofs[d][0]; -// } - -// // printf("exec: %d %d %d %d %d %d\n", tbx, tby, tbz, gridx, gridy, gridz); -// dim3 threadsPerBlock(tbx, tby, tbz); -// dim3 blockPerGrid(gridx, gridy, gridz); -// // ldvs + ldws + shape -// size_t sm_size = (D * 3) * sizeof(SIZE); -// // quantizer -// sm_size += (l_target + 1) * sizeof(T); -// // ranges -// sm_size += (l_target + 2) * D * sizeof(SIZE); -// // volumes -// sm_size += tbx * (l_target + 1) * sizeof(T); -// sm_size += tby * (l_target + 1) * sizeof(T); -// sm_size += tbz * (l_target + 1) * sizeof(T); -// if (D > 3) sm_size += (D-3) * (l_target + 1) * sizeof(T); -// // printf("sm_size: %llu\n", sm_size); -// if (m.ntype == norm_type::L_Inf) { -// _levelwise_linear_quantize -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, prep_huffman, m.huff_dict_size, shape, -// outlier_count, outlier_idx, outliers); -// } else if (m.ntype == norm_type::L_2) { -// _levelwise_linear_quantize -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, prep_huffman, m.huff_dict_size, shape, -// outlier_count, outlier_idx, outliers); -// } else { -// std::cout << log::log_err << "unsupported norm type!\n"; -// exit(-1); -// } - -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void levelwise_linear_quantize(Handle &handle, SIZE *shapes, -// SIZE l_target, T *volumes, SIZE ldvolumes, -// Metadata &m, T *dv, SIZE *ldvs, -// QUANTIZED_INT *dwork, SIZE *ldws, -// bool prep_huffman, SIZE *shape, -// LENGTH *outlier_count, LENGTH *outlier_idx, -// QUANTIZED_INT *outliers, int queue_idx) { -// #define QUANTIZE(R, C, F) \ -// { \ -// levelwise_linear_quantize_adaptive_launcher( \ -// handle, shapes, l_target, volumes, ldvolumes, m, dv, ldvs, dwork, -// ldws, prep_huffman, \ -// shape, outlier_count, outlier_idx, outliers, queue_idx); \ -// } - -// if (D >= 3) { -// QUANTIZE(4, 4, 16) -// } -// if (D == 2) { -// QUANTIZE(1, 4, 32) -// } -// if (D == 1) { -// QUANTIZE(1, 1, 64) -// } -// #undef QUANTIZE -// } - -// template -// __global__ void -// _levelwise_linear_dequantize(SIZE *shapes, SIZE l_target, T *quantizers, T * -// volumes, SIZE ldvolumes, QUANTIZED_INT *dv, -// SIZE *ldvs, T *dwork, SIZE *ldws, bool -// prep_huffman, SIZE dict_size, LENGTH -// outlier_count, LENGTH *outlier_idx, -// QUANTIZED_INT *outliers) { - -// LENGTH threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; -// LENGTH blockId = (blockIdx.z * (gridDim.x * 
gridDim.y)) + -// (blockIdx.y * gridDim.x) + blockIdx.x; -// LENGTH gloablId = blockId * blockDim.x * blockDim.y * blockDim.z + -// threadId; - -// T * smT = SharedMemory(); -// T * quantizers_sm = smT; smT += l_target + 1; -// T * volumes_0 = smT; if (CALC_VOL) smT += blockDim.x * (l_target + 1); -// T * volumes_1 = smT; if (CALC_VOL) smT += blockDim.y * (l_target + 1); -// T * volumes_2 = smT; if (CALC_VOL) smT += blockDim.z * (l_target + 1); -// T * volumes_3_plus = smT; -// if (CALC_VOL && D > 3) smT += (D-3) * (l_target + 1); - -// SIZE * smInt = (SIZE *)smT; -// SIZE *ldvs_sm = smInt; smInt += D; -// SIZE *ldws_sm = smInt; smInt += D; -// SIZE *shape_sm = smInt; smInt += D; -// SIZE *shapes_sm = smInt; smInt += D * (l_target + 2); - -// if (threadId < l_target + 1) { -// quantizers_sm[threadId] = quantizers[threadId]; -// } -// if (threadId < D) { -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// if (threadId < D * (l_target + 2)) { -// shapes_sm[threadId] = shapes[threadId]; -// } - -// __syncthreads(); - -// // bool debug = false; -// // if (blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0 && -// // threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0) { -// // debug = true; -// // for (int d = 0; d < D; d ++) { -// // printf("shapes_sm[%d]\n", d); -// // for (int l = 0; l < l_target + 1; l++) { -// // printf("%d ", shapes_sm[(l_target+1) * d + l]); -// // } -// // printf("\n"); -// // } -// // } -// // __syncthreads(); - -// // determine global idx -// SIZE idx[D]; //thread global idx -// SIZE idx0[D]; //block global idx - -// SIZE firstD = div_roundup(shapes_sm[l_target + 1], F); - -// SIZE bidx = blockIdx.x; -// idx[0] = (bidx % firstD) * F + threadIdx.x; -// idx0[0] = (bidx % firstD) * F; - -// // printf("shapes_sm[l_target+1]: %d firstD %d idx[0] %d\n", -// // shapes_sm[l_target+1], firstD, idx[0]); - -// bidx /= firstD; -// if (D >= 2) { -// idx[1] = blockIdx.y * blockDim.y + threadIdx.y; -// idx0[1] = blockIdx.y * blockDim.y; -// } -// if (D >= 3) { -// idx[2] = blockIdx.z * blockDim.z + threadIdx.z; -// idx0[2] = blockIdx.z * blockDim.z; -// } - -// for (DIM d = 3; d < D; d++) { -// idx[d] = bidx % shapes_sm[(l_target + 2) * d + l_target + 1]; -// idx0[d] = idx[d]; -// bidx /= shapes_sm[(l_target + 2) * d + l_target + 1]; - -// } - -// if (CALC_VOL) { -// // cache volumes -// for (SIZE l = 0; l < l_target+1; l++) { -// // volumes 0 -// if (threadId < blockDim.x && idx0[0] + threadId < shapes_sm[(l_target + -// 2) * 0 + l_target + 1]) { -// // printf("%d < %d[%d, %d, %d]\n", idx0[0] + (int)threadId, -// // shapes_sm[(l_target + 2) * 0 + l_target + 1], -// // l_target, (l_target + 2) * 0 + l_target + 1, l_target + 2); -// volumes_0[l * blockDim.x + threadId] = -// volumes[(0 * (l_target + 1) + l) * ldvolumes + idx0[0] + threadId]; -// // printf("load %f\n", volumes_0[l * blockDim.x + threadId]); -// } -// if (D >= 2) { -// // volumes 1 -// if (threadId < blockDim.y && idx0[1] + threadId < shapes_sm[(l_target -// + 2) * 1 + l_target + 1]) { -// volumes_1[l * blockDim.y + threadId] = -// volumes[(1 * (l_target + 1) + l) * ldvolumes + idx0[1] + -// threadId]; -// } -// } -// if (D >= 3) { -// // volumes 2 -// if (threadId < blockDim.z && idx0[2] + threadId < shapes_sm[(l_target -// + 2) * 2 + l_target + 1]) { -// volumes_2[l * blockDim.z + threadId] = -// volumes[(2 * (l_target + 1) + l) * ldvolumes + idx0[2] + -// threadId]; -// } -// } -// } - -// if (D >= 4) { -// if (threadId < 1) { -// for (DIM d = 3; d < D; d++) { -// for 
(SIZE l = 0; l < l_target+1; l++) { -// volumes_3_plus[(d-3) * (l_target + 1) + l] = -// volumes[(d * (l_target + 1) + l) * ldvolumes + idx[d]]; -// } -// } -// } -// } -// } - -// // if (blockIdx.y == 0 && blockIdx.x == 0 && threadId == 0) { -// // printf("volumes_0: "); -// // for (int l = 0; l < l_target+1; l++) { -// // printf("l = %d\n", l); -// // for (int i = 0; i < min(blockDim.x, shapes_sm[(l_target + 2) * 0 + -// l_target + 1]) ; i++) { -// // printf("%f ", volumes_0[l * blockDim.x + i]); -// // } -// // printf("\n"); -// // } -// // printf("\n"); -// // printf("volumes_1: "); -// // for (int l = 0; l < l_target+1; l++) { -// // printf("l = %d\n", l); -// // for (int i = 0; i < min(blockDim.y, shapes_sm[(l_target + 2) * 1 + -// l_target + 1]); i++) { -// // printf("%f ", volumes_1[l * blockDim.y + i]); -// // } -// // printf("\n"); -// // } - -// // } - -// __syncthreads(); - -// int level = 0; -// for (DIM d = 0; d < D; d++) { -// long long unsigned int l_bit = 0l; -// for (SIZE l = 0; l < l_target + 1; l++) { -// int bit = (idx[d] >= shapes_sm[(l_target + 2) * d + l]) && -// (idx[d] < shapes_sm[(l_target + 2) * d + l + 1]); -// l_bit += bit << l; -// // printf("idx: %d %d d: %d l_bit: %llu\n", idx[1], idx[0], d, l_bit); -// } -// level = max(level, __ffsll(l_bit)); -// } - -// bool in_range = true; -// for (DIM d = 0; d < D; d++) { -// if (idx[d] >= shapes_sm[(l_target + 2) * d + l_target + 1]) -// in_range = false; -// } - -// level = level - 1; -// if (level >= 0 && level <= l_target && in_range) { -// // printf("%d %d %d %d\n", idx[3], idx[2], idx[1], idx[0]); -// // printf("idx: %d %d l: %d\n", idx[1], idx[0], level); -// QUANTIZED_INT quantized_data = dv[get_idx(ldvs, idx)]; -// T volume = 1; -// if (CALC_VOL) { -// volume *= volumes_0[level * blockDim.x + threadIdx.x]; -// if (D >= 2) volume *= volumes_1[level * blockDim.y + threadIdx.y]; -// if (D >= 3) volume *= volumes_2[level * blockDim.z + threadIdx.z]; -// if (D >= 4) { -// for (int d = 3; d < D; d++) { -// volume *= volumes_3_plus[(d-3) * (l_target + 1) + level]; -// } -// } -// if (sizeof(T) == sizeof(double)) volume = sqrt(volume); -// else if (sizeof(T) == sizeof(float)) volume = sqrtf(volume); -// } - -// if (prep_huffman) { -// quantized_data -= dict_size / 2; -// } - -// // printf("%d %d %d %d %d %d vol %f (%f * %f * %f), dequantizers: %f, -// before: %d, dequantized: %f\n", blockIdx.z, blockIdx.y, blockIdx.x, -// threadIdx.z, threadIdx.y, threadIdx.x, volume, -// // volumes_0[level * blockDim.x + threadIdx.x], volumes_1[level * -// blockDim.y + threadIdx.y], volumes_2[level * blockDim.z + threadIdx.z], -// // quantizers_sm[level] / volume, quantized_data, (quantizers_sm[level] -// / volume) * (T)quantized_data); dwork[get_idx(ldws, idx)] = -// (quantizers_sm[level] * volume) * (T)quantized_data; -// // dwork[get_idx(ldws, idx)] = (quantizers_sm[level] / volume) * -// (T)quantized_data; -// // dwork[get_idx(ldws, idx)] = (T)dv[get_idx(ldvs, idx)]; - -// // printf("dw[%llu] %d dequantizers[%d]%f -> dw[%llu]%f \n", -// // get_idx(ldvs, idx), -// // quantized_data, level, quantizers_sm[level], get_idx(ldws, -// idx), -// // quantizers_sm[level] * (T)quantized_data); -// } - -// // //outliers -// // if (gloablId < outlier_count) { -// // size_t linerized_idx = outlier_idx[gloablId]; -// // for (int d = 0; d < D; d++) { -// // idx[d] = linerized_idx % shapes_sm[(l_target+2) * d+l_target+1]; -// // linerized_idx /= shapes_sm[(l_target+2) * d+l_target+1]; -// // } -// // int outliter = outliers[gloablId]; -// // 
outliter -= dict_size / 2; - -// // level = 0; -// // for (int d = 0; d < D; d++) { -// // long long unsigned int l_bit = 0l; -// // for (int l = 0; l < l_target+1; l++) { -// // int bit = (idx[d] >= shapes_sm[(l_target+2) * d + l]) && (idx[d] < -// // shapes_sm[(l_target+2) * d + l+1]); l_bit += bit << l; -// // // printf("idx: %d %d d: %d l_bit: %llu\n", idx[1], idx[0], d, -// // l_bit); -// // } -// // level = max(level, __ffsll(l_bit)); -// // } -// // level = level - 1; - -// // dwork[get_idx(ldws, idx)] = quantizers_sm[level] * (T)outliter; - -// // // printf("outliter: dw[%llu] %d dequantizers[%d]%f -> dw[%llu]%f \n", -// // get_idx(ldvs, idx), -// // // outliter, level, quantizers_sm[level], get_idx(ldws, idx), -// // quantizers_sm[level] * (T)outliter); - -// // } -// } - -// template -// __global__ void _levelwise_linear_dequantize_outliers( -// SIZE *shapes, SIZE l_target, T *quantizers, T * volumes, SIZE ldvolumes, -// QUANTIZED_INT *dv, SIZE *ldvs, T *dwork, SIZE *ldws, SIZE dict_size, -// LENGTH outlier_count, LENGTH *outlier_idx, QUANTIZED_INT *outliers) { - -// size_t threadId = (threadIdx.z * (blockDim.x * blockDim.y)) + -// (threadIdx.y * blockDim.x) + threadIdx.x; -// size_t blockId = (blockIdx.z * (gridDim.x * gridDim.y)) + -// (blockIdx.y * gridDim.x) + blockIdx.x; -// size_t gloablId = blockId * blockDim.x * blockDim.y * blockDim.z + -// threadId; - -// T *sm = SharedMemory(); -// T *quantizers_sm = sm; sm += l_target + 1; - -// SIZE *sm_size = (SIZE*)sm; -// SIZE *ldvs_sm = sm_size; sm_size += D; -// SIZE *ldws_sm = sm_size; sm_size += D; -// SIZE *shapes_sm = sm_size; sm_size += D * (l_target + 2); - -// if (threadId < l_target + 1) { -// quantizers_sm[threadId] = quantizers[threadId]; -// } -// if (threadId < D) { -// ldvs_sm[threadId] = ldvs[threadId]; -// ldws_sm[threadId] = ldws[threadId]; -// } -// if (threadId < D * (l_target + 2)) { -// shapes_sm[threadId] = shapes[threadId]; -// } - -// __syncthreads(); -// SIZE idx[D]; //thread global idx - -// // outliers -// if (gloablId < outlier_count) { -// size_t linerized_idx = outlier_idx[gloablId]; -// // for (DIM d = 0; d < D; d++) { -// // idx[d] = linerized_idx % shapes_sm[(l_target + 2) * d + l_target + -// 1]; -// // linerized_idx /= shapes_sm[(l_target + 2) * d + l_target + 1]; -// // } -// QUANTIZED_INT outliter = outliers[gloablId]; - -// dv[linerized_idx] = outliter; -// // printf("put back[%llu] <- outlier[%llu]: %llu\n", linerized_idx, -// gloablId, outliter); - -// // outliter -= dict_size / 2; - -// // int level = 0; -// // for (DIM d = 0; d < D; d++) { -// // long long unsigned int l_bit = 0l; -// // for (SIZE l = 0; l < l_target + 1; l++) { -// // int bit = (idx[d] >= shapes_sm[(l_target + 2) * d + l]) && -// // (idx[d] < shapes_sm[(l_target + 2) * d + l + 1]); -// // l_bit += bit << l; -// // // printf("idx: %d %d d: %d l_bit: %llu\n", idx[1], idx[0], d, -// l_bit); -// // } -// // level = max(level, __ffsll(l_bit)); -// // } -// // level = level - 1; - -// // T volume = 1; - -// // if (CALC_VOL) { -// // for (DIM d = 0; d < D; d++) { -// // volume *= volumes[(d * (l_target+1) + level) * ldvolumes + -// idx[d]]; -// // } -// // if (sizeof(T) == sizeof(double)) volume = sqrt(volume); -// // else if (sizeof(T) == sizeof(float)) volume = sqrtf(volume); -// // } -// // dwork[get_idx(ldws, idx)] = (quantizers_sm[level] * volume) * -// (T)outliter; -// } -// } - -// template -// void levelwise_linear_dequantize_adaptive_launcher( -// Handle &handle, SIZE *shapes, SIZE l_target, T *volumes, -// SIZE 
ldvolumes, Metadata &m, QUANTIZED_INT *dv, SIZE *ldvs, T *dwork, -// SIZE *ldws, bool prep_huffman, LENGTH outlier_count, LENGTH *outlier_idx, -// QUANTIZED_INT *outliers, int queue_idx) { - -// // printf("norm: %f, tol: %f, s: %f, dict_size: %d\n", m.norm, m.tol, m.s, -// // m.dict_size); - -// T *quantizers = new T[l_target + 1]; -// calc_quantizers(handle, quantizers, m, false); -// cudaMemcpyAsyncHelper(handle, handle.quantizers, quantizers, -// sizeof(T) * (l_target + 1), H2D, queue_idx); - -// SIZE total_thread_z = handle.dofs[2][0]; -// SIZE total_thread_y = handle.dofs[1][0]; -// SIZE total_thread_x = handle.dofs[0][0]; -// // linearize other dimensions -// SIZE tbz = R; -// SIZE tby = C; -// SIZE tbx = F; -// SIZE gridz = ceil((float)total_thread_z / tbz); -// SIZE gridy = ceil((float)total_thread_y / tby); -// SIZE gridx = ceil((float)total_thread_x / tbx); -// for (DIM d = 3; d < D; d++) { -// gridx *= handle.dofs[d][0]; -// } - -// // printf("exec: %d %d %d %d %d %d\n", tbx, tby, tbz, gridx, gridy, gridz); -// dim3 threadsPerBlock(tbx, tby, tbz); -// dim3 blockPerGrid(gridx, gridy, gridz); -// size_t sm_size = (D * 3) * sizeof(SIZE); -// sm_size += (l_target + 1) * sizeof(T); -// sm_size += (l_target + 2) * D * sizeof(SIZE); -// sm_size += tbx * (l_target + 1) * sizeof(T); -// sm_size += tby * (l_target + 1) * sizeof(T); -// sm_size += tbz * (l_target + 1) * sizeof(T); -// if (D > 3) sm_size += (D-3) * (l_target + 1) * sizeof(T); - -// if (m.ntype == norm_type::L_Inf) { -// if (prep_huffman) { -// _levelwise_linear_dequantize_outliers -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, m.huff_dict_size, outlier_count, -// outlier_idx, outliers); -// } -// gpuErrchk(cudaDeviceSynchronize()); -// _levelwise_linear_dequantize -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, prep_huffman, m.huff_dict_size, outlier_count, -// outlier_idx, outliers); -// gpuErrchk(cudaDeviceSynchronize()); -// } else if (m.ntype == norm_type::L_2){ -// if (prep_huffman) { -// _levelwise_linear_dequantize_outliers -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, m.huff_dict_size, outlier_count, -// outlier_idx, outliers); -// } -// gpuErrchk(cudaDeviceSynchronize()); -// _levelwise_linear_dequantize -// <<>>( -// shapes, l_target, handle.quantizers, volumes, ldvolumes, dv, -// ldvs, dwork, ldws, prep_huffman, m.huff_dict_size, outlier_count, -// outlier_idx, outliers); -// gpuErrchk(cudaDeviceSynchronize()); -// } else { -// std::cout << log::log_err << "unsupported norm type!\n"; -// exit(-1); -// } -// gpuErrchk(cudaGetLastError()); -// if (handle.sync_and_check_all_kernels) { -// gpuErrchk(cudaDeviceSynchronize()); -// } -// } - -// template -// void levelwise_linear_dequantize(Handle &handle, SIZE *shapes, -// SIZE l_target, T *volumes, SIZE ldvolumes, -// Metadata &m, QUANTIZED_INT *dv, SIZE *ldvs, -// T *dwork, SIZE *ldws, bool prep_huffman, -// LENGTH outlier_count, LENGTH *outlier_idx, -// QUANTIZED_INT *outliers, int queue_idx) { -// #define DEQUANTIZE(R, C, F) \ -// { \ -// levelwise_linear_dequantize_adaptive_launcher( \ -// handle, shapes, l_target, volumes, ldvolumes, m, dv, ldvs, dwork, -// ldws, prep_huffman, outlier_count, \ -// outlier_idx, outliers, queue_idx); \ -// } - -// if (D >= 3) { -// DEQUANTIZE(4, 4, 16) -// } -// if (D == 2) { -// DEQUANTIZE(1, 4, 32) -// } -// if (D == 1) { -// DEQUANTIZE(1, 1, 64) -// } - -// #undef DEQUANTIZE -// 
}
-
 } // namespace mgard_x

 #endif
\ No newline at end of file
diff --git a/include/mgard-x/RuntimeX/AutoTuners/AutoTuner.h b/include/mgard-x/RuntimeX/AutoTuners/AutoTuner.h
index 9afa942650..d7bfae1485 100644
--- a/include/mgard-x/RuntimeX/AutoTuners/AutoTuner.h
+++ b/include/mgard-x/RuntimeX/AutoTuners/AutoTuner.h
@@ -192,12 +192,17 @@ MGARDX_CONT void FillAutoTunerTable(std::string kernel_name, int precision_idx,
                                     int range_l, int config) {

   std::string device_type_string = "";
-  if (std::is_same<DeviceType, Serial>::value) {
+  if (std::is_same<DeviceType, SERIAL>::value) {
     device_type_string = "Serial";
   } else if (std::is_same<DeviceType, CUDA>::value) {
     device_type_string = "Cuda";
   } else if (std::is_same<DeviceType, HIP>::value) {
     device_type_string = "Hip";
+  } else if (std::is_same<DeviceType, SYCL>::value) {
+    device_type_string = "Sycl";
+  } else {
+    std::cout << log::log_err << "invalid device_type in FillAutoTunerTable.\n";
+    exit(-1);
   }

   string curr_file_path = __FILE__;
@@ -282,11 +287,21 @@ template <typename DeviceType> class AutoTuner {
   static AutoTuningTable<DeviceType> autoTuningTable;
   static bool ProfileKenrles;
 };
+
+template <typename DeviceType> void BeginAutoTuning() {
+  AutoTuner<DeviceType>::ProfileKernels = true;
+}
+
+template <typename DeviceType> void EndAutoTuning() {
+  AutoTuner<DeviceType>::ProfileKernels = false;
+}
+
 } // namespace mgard_x

 #include "AutoTunerCuda.h"
 #include "AutoTunerHip.h"
 #include "AutoTunerKokkos.h"
 #include "AutoTunerSerial.h"
+#include "AutoTunerSycl.h"

 #endif
\ No newline at end of file
diff --git a/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.h b/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.h
index c82b0e814e..c59f8835eb 100644
--- a/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.h
+++ b/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.h
@@ -10,7 +10,7 @@

 namespace mgard_x {

-template <> class AutoTuningTable<Serial> {
+template <> class AutoTuningTable<SERIAL> {
 public:
   static const int num_precision = 2;
   static const int num_range = 9;
@@ -56,11 +56,11 @@ template <> class AutoTuningTable<Serial> {
   static int llk[num_precision][num_range];
 };

-template <> class AutoTuner<Serial> {
+template <> class AutoTuner<SERIAL> {
 public:
   MGARDX_CONT
   AutoTuner(){};
-  static AutoTuningTable<Serial> autoTuningTable;
+  static AutoTuningTable<SERIAL> autoTuningTable;
   static bool ProfileKernels;
 };
diff --git a/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.h b/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.h
new file mode 100644
index 0000000000..28c23e8562
--- /dev/null
+++ b/include/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright 2022, Oak Ridge National Laboratory.
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#ifndef MGARD_X_AUTOTUNER_SYCL_H +#define MGARD_X_AUTOTUNER_SYCL_H + +namespace mgard_x { + +template <> class AutoTuningTable { +public: + static const int num_precision = 2; + static const int num_range = 9; + + static int gpk_reo_3d[num_precision][num_range]; + + static int gpk_rev_3d[num_precision][num_range]; + + static int gpk_reo_nd[num_precision][num_range]; + + static int gpk_rev_nd[num_precision][num_range]; + + static int lpk1_3d[num_precision][num_range]; + + static int lpk2_3d[num_precision][num_range]; + + static int lpk3_3d[num_precision][num_range]; + + static int lpk1_nd[num_precision][num_range]; + + static int lpk2_nd[num_precision][num_range]; + + static int lpk3_nd[num_precision][num_range]; + + static int ipk1_3d[num_precision][num_range]; + + static int ipk2_3d[num_precision][num_range]; + + static int ipk3_3d[num_precision][num_range]; + + static int ipk1_nd[num_precision][num_range]; + + static int ipk2_nd[num_precision][num_range]; + + static int ipk3_nd[num_precision][num_range]; + + static int lwpk[num_precision][num_range]; + + static int lwqzk[num_precision][num_range]; + + static int lwdqzk[num_precision][num_range]; + + static int llk[num_precision][num_range]; +}; + +template <> class AutoTuner { +public: + MGARDX_CONT + AutoTuner(){}; + static AutoTuningTable autoTuningTable; + static bool ProfileKernels; +}; + +} // namespace mgard_x + +#endif \ No newline at end of file diff --git a/include/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt b/include/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt index be80fce9c1..507bd33418 100644 --- a/include/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt +++ b/include/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt @@ -3,5 +3,6 @@ list(APPEND MGARD_X_HEADER ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerSerial.h ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerCuda.h ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerHip.h + ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerSycl.h ) set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/RuntimeX/DataStructures/SubArray.hpp b/include/mgard-x/RuntimeX/DataStructures/SubArray.hpp index 07d33f78d0..5003a245bf 100644 --- a/include/mgard-x/RuntimeX/DataStructures/SubArray.hpp +++ b/include/mgard-x/RuntimeX/DataStructures/SubArray.hpp @@ -17,14 +17,12 @@ template class SubArray { public: MGARDX_CONT_EXEC SubArray(); + MGARDX_CONT SubArray(Array &array, bool get_host_pointer = false); + MGARDX_CONT SubArray(std::vector shape, T *dv); - MGARDX_CONT_EXEC - SubArray(SubArray &subArray); - MGARDX_CONT_EXEC - SubArray(const SubArray &subArray); MGARDX_CONT_EXEC T *data() { return this->dv; } @@ -93,18 +91,12 @@ template class SubArray { MGARDX_CONT_EXEC void setPitched(bool pitched) { this->pitched = pitched; } - // MGARDX_CONT_EXEC - // SIZE * getLdd() { return this->ldvs_d; } - MGARDX_CONT_EXEC SIZE getLddv1() const { return this->lddv1; } MGARDX_CONT_EXEC SIZE getLddv2() const { return this->lddv2; } - MGARDX_CONT_EXEC - SubArray & - operator=(const SubArray &subArray); void offset(std::vector idx); MGARDX_CONT @@ -136,6 +128,11 @@ template class SubArray { return this->dv + offset; } + MGARDX_CONT_EXEC + T *operator()(IDX l, IDX z, IDX y, IDX x) { + return this->dv + this->_ldvs[2] * this->_ldvs[1] * this->_ldvs[0] * l + + this->_ldvs[1] * this->_ldvs[0] * z + this->_ldvs[0] * y + x; + } MGARDX_CONT_EXEC T *operator()(IDX z, IDX y, IDX x) 
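// Illustrative annotation (not part of the patch): the 4-D operator() overload
// added above computes the flat offset
//   ((l * _ldvs[2] + z) * _ldvs[1] + y) * _ldvs[0] + x,
// i.e. the same row-major indexing scheme as the 3-D overload that follows,
// extended by one extra leading dimension l.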
{ return this->dv + this->lddv2 * this->lddv1 * z + this->lddv1 * y + x; @@ -174,21 +171,15 @@ template class SubArray { MGARDX_CONT_EXEC bool isNull() { return this->dv == NULL; } - MGARDX_CONT_EXEC - ~SubArray(); - using DataType = T; using DevType = DeviceType; static const DIM NumDims = D; private: - // std::vector shape; - T *dv; // device pointer - T *v; // host pointer + T *dv = NULL; // device pointer + T *v = NULL; // host pointer bool has_host_pointer = false; - // std::vector ldvs_h; - // SIZE *ldvs_d; SIZE _ldvs[D]; SIZE _shape[D]; @@ -224,10 +215,7 @@ MGARDX_CONT_EXEC SubArray::SubArray() { template MGARDX_CONT SubArray::SubArray(Array &array, bool get_host_pointer) { - // this->shape = array.getShape(); this->dv = array.data(); - // this->ldvs_h = array.ld(); - // this->ldvs_d = array.get_ldvs_d(); for (DIM d = 0; d < D; d++) { this->_shape[d] = array.shape()[d]; @@ -245,9 +233,7 @@ MGARDX_CONT SubArray::SubArray(Array &array, template MGARDX_CONT SubArray::SubArray(std::vector shape, T *dv) { - // this->shape = shape; this->dv = dv; - // this->ldvs_h = shape; for (DIM d = 0; d < D; d++) { this->_shape[d] = shape[d]; @@ -258,89 +244,6 @@ MGARDX_CONT SubArray::SubArray(std::vector shape, this->lddv2 = this->_ldvs[1]; } -template -MGARDX_CONT_EXEC -SubArray::SubArray(SubArray &subArray) { - // this->shape = subArray.shape; - this->dv = subArray.dv; - // this->ldvs_h = subArray.ldvs_h; - // this->ldvs_d = subArray.ldvs_d; - - for (DIM d = 0; d < D; d++) { - this->_shape[d] = subArray.getShape(d); - this->_ldvs[d] = subArray._ldvs[d]; - } - - this->lddv1 = subArray.lddv1; - this->lddv2 = subArray.lddv2; - - this->projected_dim0 = subArray.projected_dim0; - this->projected_dim1 = subArray.projected_dim1; - this->projected_dim2 = subArray.projected_dim2; - - if (subArray.has_host_pointer) { - this->has_host_pointer = true; - this->v = subArray.v; - } - - this->pitched = subArray.pitched; -} - -template -MGARDX_CONT_EXEC SubArray::SubArray( - const SubArray &subArray) { - // this->shape = subArray.shape; - this->dv = subArray.dv; - // this->ldvs_h = subArray.ldvs_h; - // this->ldvs_d = subArray.ldvs_d; - - for (DIM d = 0; d < D; d++) { - this->_shape[d] = subArray._shape[d]; - this->_ldvs[d] = subArray._ldvs[d]; - } - - this->lddv1 = subArray.lddv1; - this->lddv2 = subArray.lddv2; - - this->projected_dim0 = subArray.projected_dim0; - this->projected_dim1 = subArray.projected_dim1; - this->projected_dim2 = subArray.projected_dim2; - - if (subArray.has_host_pointer) { - this->has_host_pointer = true; - this->v = subArray.v; - } - this->pitched = subArray.pitched; -} - -template -MGARDX_CONT_EXEC SubArray &SubArray:: -operator=(const SubArray &subArray) { - // this->shape = subArray.shape; - this->dv = subArray.dv; - // this->ldvs_h = subArray.ldvs_h; - // this->ldvs_d = subArray.ldvs_d; - - for (DIM d = 0; d < D; d++) { - this->_shape[d] = subArray._shape[d]; - this->_ldvs[d] = subArray._ldvs[d]; - } - - this->lddv1 = subArray.lddv1; - this->lddv2 = subArray.lddv2; - - this->projected_dim0 = subArray.projected_dim0; - this->projected_dim1 = subArray.projected_dim1; - this->projected_dim2 = subArray.projected_dim2; - - if (subArray.has_host_pointer) { - this->has_host_pointer = true; - this->v = subArray.v; - } - this->pitched = subArray.pitched; - return *this; -} - template MGARDX_CONT SubArray<1, T, DeviceType> SubArray::Linearize() { SubArray<1, T, DeviceType> subArray; @@ -348,20 +251,10 @@ MGARDX_CONT SubArray<1, T, DeviceType> SubArray::Linearize() { SIZE linearized_shape = 1; 
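// Illustrative annotation (not part of the patch): the loop below folds the
// D-dimensional shape into one flat length, so e.g. a {4, 3, 2} SubArray is
// re-viewed as a 1-D SubArray of 24 elements over the same device pointer.
// Note the rewritten Linearize() now goes through setShape/setLd/project
// rather than writing lddv1/lddv2 and projected_dim* directly.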
for (DIM d = 0; d < D; d++) linearized_shape *= this->_shape[d]; - // subArray.shape = {linearized_shape}; subArray.dv = this->dv; - // subArray.ldvs_h = this->ldvs_h; - // subArray.ldvs_d = this->ldvs_d; - - this->_shape[0] = linearized_shape; - this->_ldvs[0] = linearized_shape; - - subArray.lddv1 = linearized_shape; - subArray.lddv2 = 1; - - subArray.projected_dim0 = this->projected_dim0; - subArray.projected_dim1 = this->projected_dim1; - subArray.projected_dim2 = this->projected_dim2; + subArray.setShape(0, linearized_shape); + subArray.setLd(0, linearized_shape); + subArray.project(0, 1, 2); if (this->has_host_pointer) { subArray.has_host_pointer = true; @@ -395,16 +288,6 @@ SubArray::Slice3D(DIM d1, DIM d2, DIM d3) { subArray.setLd(2, this->_ldvs[d3]); subArray.project(d1, d2, d3); - // subArray.ldvs_h = this->ldvs_h; - // subArray.ldvs_d = this->ldvs_d; - - // subArray.lddv1 = subArray.ldvs[0]; - // subArray.lddv2 = subArray.ldvs[1]; - - // subArray.projected_dim0 = d1; - // subArray.projected_dim1 = d2; - // subArray.projected_dim2 = d3; - if (this->has_host_pointer) { subArray.setDataHost(this->v); } @@ -417,13 +300,11 @@ MGARDX_CONT void SubArray::offset(std::vector idx) { SIZE _idx[D]; for (DIM d = 0; d < D; d++) _idx[d] = idx[d]; - // dv += get_idx(ldvs_h, idx); dv += calc_offset(_idx); } template MGARDX_CONT void SubArray::resize(std::vector shape) { - // this->shape = shape; for (DIM d = 0; d < D; d++) { _shape[d] = shape[d]; } @@ -432,10 +313,6 @@ MGARDX_CONT void SubArray::resize(std::vector shape) { template MGARDX_CONT void SubArray::offset(DIM dim, SIZE offset_value) { - // std::vector idx(D, 0); - // idx[dim] = offset_value; - // dv += get_idx(ldvs_h, idx); - SIZE idx[D]; for (DIM d = 0; d < D; d++) idx[d] = 0; @@ -463,10 +340,5 @@ MGARDX_CONT void SubArray::project(DIM dim0, DIM dim1, } } -template -MGARDX_CONT_EXEC SubArray::~SubArray() { - // nothing needs to be released -} - } // namespace mgard_x #endif \ No newline at end of file diff --git a/include/mgard-x/RuntimeX/DataStructures/SubArrayCopy.hpp b/include/mgard-x/RuntimeX/DataStructures/SubArrayCopy.hpp index 25b95290ce..2674a60ee9 100644 --- a/include/mgard-x/RuntimeX/DataStructures/SubArrayCopy.hpp +++ b/include/mgard-x/RuntimeX/DataStructures/SubArrayCopy.hpp @@ -13,9 +13,9 @@ namespace mgard_x { template class CompatibleDeviceType { using DeviceType = std::conditional< - std::is_same::value && - std::is_same::value, - Serial, + std::is_same::value && + std::is_same::value, + SERIAL, std::conditional< std::is_same::value || std::is_same::value, @@ -26,7 +26,7 @@ class CompatibleDeviceType { HIP, std::conditional::value || std::is_same::value, - KOKKOS, None>>>>; + KOKKOS, NONE>>>>; }; template diff --git a/include/mgard-x/RuntimeX/DataTypes.h b/include/mgard-x/RuntimeX/DataTypes.h index 5a63a7397f..ca9867d9fe 100644 --- a/include/mgard-x/RuntimeX/DataTypes.h +++ b/include/mgard-x/RuntimeX/DataTypes.h @@ -12,6 +12,8 @@ #define MGARDX_COMPILE_CUDA #elif defined __HIPCC__ #define MGARDX_COMPILE_HIP +#elif defined SYCL_LANGUAGE_VERSION +#define MGARDX_COMPILE_SYCL #else #define MGARDX_COMPILE_SERIAL #endif @@ -58,6 +60,14 @@ namespace mgard_x { #define MGARDX_MANAGED __managed__ #endif +#ifdef MGARDX_COMPILE_SYCL +#define MGARDX_CONT __inline__ +#define MGARDX_KERL +#define MGARDX_EXEC __inline__ +#define MGARDX_CONT_EXEC __inline__ +#define MGARDX_MANAGED +#endif + #if defined MGARDX_COMPILE_KOKKOS #define MGARDX_CONT __inline__ #define MGARDX_KERL @@ -81,15 +91,16 @@ namespace mgard_x { #define SUBTRACT 2 
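// Illustrative sketch (not part of the patch): the empty tag classes declared
// below exist only for compile-time backend dispatch; a caller selects a
// backend by naming a tag. This uses only templates that appear elsewhere in
// this diff (BeginAutoTuning/EndAutoTuning, AutoTuner<DeviceType>):
//
//   template <typename DeviceType> void profile_backend() {
//     BeginAutoTuning<DeviceType>();  // AutoTuner<DeviceType>::ProfileKernels = true
//     // ... run kernels so each Execute() is timed per candidate config ...
//     EndAutoTuning<DeviceType>();    // AutoTuner<DeviceType>::ProfileKernels = false
//   }
//   // e.g. profile_backend<SYCL>(); or profile_backend<SERIAL>();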
class Device {}; -class Serial : public Device {}; +class SERIAL : public Device {}; class CUDA : public Device {}; class HIP : public Device {}; -class None : public Device {}; +class SYCL : public Device {}; +class NONE : public Device {}; #if defined MGARDX_COMPILE_KOKKOS using KOKKOS = Kokkos::DefaultExecutionSpace; #else -using KOKKOS = None; +using KOKKOS = NONE; #endif class DPCxx : public Device {}; diff --git a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapter.h b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapter.h index a02a1247af..a8a5da6387 100644 --- a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapter.h +++ b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapter.h @@ -16,7 +16,8 @@ namespace mgard_x { struct ExecutionReturn { - double execution_time = 0.0; + bool success = true; + double execution_time = std::numeric_limits::max(); }; template struct SyncBlock { @@ -27,10 +28,22 @@ template struct SyncGrid { MGARDX_EXEC static void Sync(); }; -template struct Atomic { - template MGARDX_EXEC static T Min(T *result, T value); - template MGARDX_EXEC static T Max(T *result, T value); - template MGARDX_EXEC static T Add(T *result, T value); +#define AtomicSystemScope 0 +#define AtomicDeviceScope 1 +#define AtomicBlockScope 2 + +#define AtomicGlobalMemory 0 +#define AtomicSharedMemory 1 + +#define RESOURCE_ENOUGH 0 +#define THREADBLOCK_TOO_LARGE 1 +#define SHARED_MEMORY_TOO_LARGE 2 + +template +struct Atomic { + MGARDX_EXEC static T Min(T *result, T value); + MGARDX_EXEC static T Max(T *result, T value); + MGARDX_EXEC static T Add(T *result, T value); }; template struct Math { @@ -99,6 +112,7 @@ template class DeviceSpecification { int *MaxNumThreadsPerSM; int *MaxNumThreadsPerTB; size_t *AvailableMemory; + std::string *DeviceNames; }; template class DeviceQueues { @@ -124,9 +138,17 @@ template class DeviceAdapter { MGARDX_CONT DeviceAdapter(){}; MGARDX_CONT + int IsResourceEnough() { return false; } + MGARDX_CONT ExecutionReturn Execute(){}; }; +template struct KeyValueComparator { + bool operator()(std::pair a, std::pair b) const { + return a.first < b.first; + } +}; + template class DeviceCollective { public: template MGARDX_CONT DeviceCollective(){}; @@ -215,6 +237,8 @@ template class DeviceRuntime { MGARDX_CONT static void SyncDevice() {} + MGARDX_CONT static std::string GetDeviceName() { return ""; } + MGARDX_CONT ~DeviceRuntime() {} diff --git a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterCuda.h b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterCuda.h index ec7ec4435a..c607ea5ee6 100644 --- a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterCuda.h +++ b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterCuda.h @@ -47,6 +47,28 @@ static __device__ __inline__ uint32_t __mylaneid() { return laneid; } +MGARDX_EXEC static float atomicMax(float *address, float val) { + int *address_as_i = (int *)address; + int old = *address_as_i, assumed; + do { + assumed = old; + old = ::atomicCAS(address_as_i, assumed, + __float_as_int(::fmaxf(val, __int_as_float(assumed)))); + } while (assumed != old); + return __int_as_float(old); +} + +MGARDX_EXEC static double atomicMax(double *address, double val) { + unsigned long long int *address_as_i = (unsigned long long int *)address; + unsigned long long int old = *address_as_i, assumed; + do { + assumed = old; + old = ::atomicCAS(address_as_i, assumed, + (unsigned long long int)::fmax(val, (double)assumed)); + } while (assumed != old); + return (double)old; +} + namespace mgard_x { template @@ -92,15 +114,34 @@ 
template <> struct SyncGrid { MGARDX_EXEC static void Sync() { cg::this_grid().sync(); } }; -template <> struct Atomic { - template MGARDX_EXEC static T Min(T *result, T value) { - return atomicMin(result, value); +template +struct Atomic { + MGARDX_EXEC static T Min(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicMin_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicMin(result, value); + } else { + return atomicMin_block(result, value); + } } - template MGARDX_EXEC static T Max(T *result, T value) { - return atomicMax(result, value); + MGARDX_EXEC static T Max(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicMax_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicMax(result, value); + } else { + return atomicMax_block(result, value); + } } - template MGARDX_EXEC static T Add(T *result, T value) { - return atomicAdd(result, value); + MGARDX_EXEC static T Add(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicAdd_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicAdd(result, value); + } else { + return atomicAdd_block(result, value); + } } }; @@ -353,6 +394,7 @@ template <> class DeviceSpecification { MaxNumThreadsPerTB = new int[NumDevices]; AvailableMemory = new size_t[NumDevices]; SupportCooperativeGroups = new bool[NumDevices]; + DeviceNames = new std::string[NumDevices]; for (int d = 0; d < NumDevices; d++) { gpuErrchk(cudaSetDevice(d)); @@ -377,6 +419,7 @@ template <> class DeviceSpecification { } else if (prop.major == 7 && (prop.minor == 2 || prop.minor == 5)) { ArchitectureGeneration[d] = 2; } + DeviceNames[d] = std::string(prop.name); } } @@ -415,6 +458,10 @@ template <> class DeviceSpecification { return SupportCooperativeGroups[dev_id]; } + MGARDX_CONT std::string GetDeviceName(int dev_id) { + return DeviceNames[dev_id]; + } + MGARDX_CONT ~DeviceSpecification() { delete[] MaxSharedMemorySize; @@ -425,6 +472,7 @@ template <> class DeviceSpecification { delete[] MaxNumThreadsPerTB; delete[] AvailableMemory; delete[] SupportCooperativeGroups; + delete[] DeviceNames; } int NumDevices; @@ -436,6 +484,7 @@ template <> class DeviceSpecification { int *MaxNumThreadsPerTB; size_t *AvailableMemory; bool *SupportCooperativeGroups; + std::string *DeviceNames; }; template <> class DeviceQueues { @@ -516,6 +565,10 @@ template <> class DeviceRuntime { gpuErrchk(cudaDeviceSynchronize()); } + MGARDX_CONT static std::string GetDeviceName() { + return DeviceSpecs.GetDeviceName(curr_dev_id); + } + MGARDX_CONT static int GetMaxSharedMemorySize() { return DeviceSpecs.GetMaxSharedMemorySize(curr_dev_id); } @@ -1816,6 +1869,19 @@ template class DeviceAdapter { MGARDX_CONT DeviceAdapter(){}; + MGARDX_CONT + int IsResourceEnough(TaskType &task) { + if (task.GetBlockDimX() * task.GetBlockDimY() * task.GetBlockDimZ() > + DeviceRuntime::GetMaxNumThreadsPerTB()) { + return THREADBLOCK_TOO_LARGE; + } + if (task.GetSharedMemorySize() > + DeviceRuntime::GetMaxSharedMemorySize()) { + return SHARED_MEMORY_TOO_LARGE; + } + return RESOURCE_ENOUGH; + } + MGARDX_CONT ExecutionReturn Execute(TaskType &task) { @@ -1834,6 +1900,21 @@ template class DeviceAdapter { << task.GetGridDimY() << ", " << task.GetGridDimZ() << ">\n"; } + ExecutionReturn ret; + if (IsResourceEnough(task) != RESOURCE_ENOUGH) { + if (DeviceRuntime::PrintKernelConfig) { + if (IsResourceEnough(task) == THREADBLOCK_TOO_LARGE) { + 
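// Illustrative annotation (not part of the patch): failing fast here with
// ret.success = false and execution_time = numeric_limits<double>::max(),
// instead of launching an oversized kernel, is what lets the autotuning call
// sites (the LLK/LWQZK/LWDQZK macros earlier in this patch) walk config down
// from 6 to 0 until a configuration fits the device, aborting only when all
// seven candidates fail.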
std::cout << log::log_info << "threadblock too large.\n"; + } + if (IsResourceEnough(task) == SHARED_MEMORY_TOO_LARGE) { + std::cout << log::log_info << "shared memory too large.\n"; + } + } + ret.success = false; + ret.execution_time = std::numeric_limits::max(); + return ret; + } + Timer timer; if (DeviceRuntime::TimingAllKernels || AutoTuner::ProfileKernels) { @@ -1875,8 +1956,6 @@ template class DeviceAdapter { ErrorSyncCheck(cudaDeviceSynchronize(), task); } - ExecutionReturn ret; - if (DeviceRuntime::TimingAllKernels || AutoTuner::ProfileKernels) { DeviceRuntime::SyncDevice(); @@ -1885,6 +1964,7 @@ template class DeviceAdapter { timer.print(task.GetFunctorName()); } if (AutoTuner::ProfileKernels) { + ret.success = true; ret.execution_time = timer.get(); } } diff --git a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterHip.h b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterHip.h index d8a245ceaa..70f35338fa 100644 --- a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterHip.h +++ b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterHip.h @@ -87,15 +87,34 @@ template <> struct SyncGrid { MGARDX_EXEC static void Sync() { cg::this_grid().sync(); } }; -template <> struct Atomic { - template MGARDX_EXEC static T Min(T *result, T value) { - return atomicMin(result, value); +template +struct Atomic { + MGARDX_EXEC static T Min(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicMin_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicMin(result, value); + } else { + return atomicMin_block(result, value); + } } - template MGARDX_EXEC static T Max(T *result, T value) { - return atomicMax(result, value); + MGARDX_EXEC static T Max(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicMax_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicMax(result, value); + } else { + return atomicMax_block(result, value); + } } - template MGARDX_EXEC static T Add(T *result, T value) { - return atomicAdd(result, value); + MGARDX_EXEC static T Add(T *result, T value) { + if constexpr (Scope == AtomicSystemScope) { + return atomicAdd_system(result, value); + } else if constexpr (Scope == AtomicDeviceScope) { + return atomicAdd(result, value); + } else { + return atomicAdd_block(result, value); + } } }; @@ -348,6 +367,7 @@ template <> class DeviceSpecification { MaxNumThreadsPerTB = new int[NumDevices]; AvailableMemory = new size_t[NumDevices]; SupportCooperativeGroups = new bool[NumDevices]; + DeviceNames = new std::string[NumDevices]; for (int d = 0; d < NumDevices; d++) { gpuErrchk(hipSetDevice(d)); @@ -373,6 +393,8 @@ template <> class DeviceSpecification { } MaxNumThreadsPerTB[d] = 32; // Due to a bug in Cooperative Groups in HIP WarpSize[d] = 32; + // DeviceNames[d] = std::string(prop.name); // Not working in HIP + DeviceNames[d] = std::string("AMD GPU"); } } @@ -410,6 +432,10 @@ template <> class DeviceSpecification { return SupportCooperativeGroups[dev_id]; } + MGARDX_CONT std::string GetDeviceName(int dev_id) { + return DeviceNames[dev_id]; + } + MGARDX_CONT ~DeviceSpecification() { delete[] MaxSharedMemorySize; @@ -420,6 +446,7 @@ template <> class DeviceSpecification { delete[] MaxNumThreadsPerTB; delete[] AvailableMemory; delete[] SupportCooperativeGroups; + delete[] DeviceNames; } int NumDevices; @@ -431,6 +458,7 @@ template <> class DeviceSpecification { int *MaxNumThreadsPerTB; size_t *AvailableMemory; bool *SupportCooperativeGroups; + std::string 
*DeviceNames; }; template <> class DeviceQueues { @@ -511,6 +539,10 @@ template <> class DeviceRuntime { gpuErrchk(hipDeviceSynchronize()); } + MGARDX_CONT static std::string GetDeviceName() { + return DeviceSpecs.GetDeviceName(curr_dev_id); + } + MGARDX_CONT static int GetMaxSharedMemorySize() { return DeviceSpecs.GetMaxSharedMemorySize(curr_dev_id); } @@ -1771,6 +1803,19 @@ template class DeviceAdapter { MGARDX_CONT DeviceAdapter(){}; + MGARDX_CONT + int IsResourceEnough(TaskType &task) { + if (task.GetBlockDimX() * task.GetBlockDimY() * task.GetBlockDimZ() > + DeviceRuntime::GetMaxNumThreadsPerTB()) { + return THREADBLOCK_TOO_LARGE; + } + if (task.GetSharedMemorySize() > + DeviceRuntime::GetMaxSharedMemorySize()) { + return SHARED_MEMORY_TOO_LARGE; + } + return RESOURCE_ENOUGH; + } + MGARDX_CONT ExecutionReturn Execute(TaskType &task) { dim3 threadsPerBlock(task.GetBlockDimX(), task.GetBlockDimY(), @@ -1790,6 +1835,21 @@ template class DeviceAdapter { << task.GetGridDimY() << ", " << task.GetGridDimZ() << ">\n"; } + ExecutionReturn ret; + if (IsResourceEnough(task) != RESOURCE_ENOUGH) { + if (DeviceRuntime::PrintKernelConfig) { + if (IsResourceEnough(task) == THREADBLOCK_TOO_LARGE) { + std::cout << log::log_info << "threadblock too large.\n"; + } + if (IsResourceEnough(task) == SHARED_MEMORY_TOO_LARGE) { + std::cout << log::log_info << "shared memory too large.\n"; + } + } + ret.success = false; + ret.execution_time = std::numeric_limits::max(); + return ret; + } + Timer timer; if (DeviceRuntime::TimingAllKernels || AutoTuner::ProfileKernels) { @@ -1831,7 +1891,6 @@ template class DeviceAdapter { ErrorSyncCheck(hipDeviceSynchronize(), task); } - ExecutionReturn ret; if (DeviceRuntime::TimingAllKernels || AutoTuner::ProfileKernels) { DeviceRuntime::SyncDevice(); @@ -1840,6 +1899,7 @@ template class DeviceAdapter { timer.print(task.GetFunctorName()); } if (AutoTuner::ProfileKernels) { + ret.success = true; ret.execution_time = timer.get(); } } diff --git a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.h b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.h index 2b7e94c83e..43bfa198c3 100644 --- a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.h +++ b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.h @@ -12,30 +12,31 @@ namespace mgard_x { -template <> struct SyncBlock { +template <> struct SyncBlock { MGARDX_EXEC static void Sync() { // do nothing } }; -template <> struct SyncGrid { +template <> struct SyncGrid { MGARDX_EXEC static void Sync() { // do nothing } }; -template <> struct Atomic { - template MGARDX_EXEC static T Min(T *result, T value) { +template +struct Atomic { + MGARDX_EXEC static T Min(T *result, T value) { T old = *result; *result = std::min(*result, value); return old; } - template MGARDX_EXEC static T Max(T *result, T value) { + MGARDX_EXEC static T Max(T *result, T value) { T old = *result; *result = std::max(*result, value); return old; } - template MGARDX_EXEC static T Add(T *result, T value) { + MGARDX_EXEC static T Add(T *result, T value) { T old = *result; *result += value; return old; @@ -62,7 +63,7 @@ static const int MultiplyDeBruijnBitPosition[64] = { 62, 11, 23, 32, 36, 44, 52, 55, 61, 22, 43, 51, 60, 42, 59, 58}; -template <> struct Math { +template <> struct Math { template MGARDX_EXEC static T Min(T a, T b) { return std::min(a, b); } @@ -617,7 +618,7 @@ MGARDX_KERL void SerialHuffmanCWCustomizedKernel(TaskType task) { DEALLOC_ACTIVE_GRID(loop1_active); } -template <> class DeviceSpecification { +template 
<> class DeviceSpecification { public: MGARDX_CONT DeviceSpecification() { @@ -630,6 +631,7 @@ template <> class DeviceSpecification { MaxNumThreadsPerTB = new int[NumDevices]; AvailableMemory = new size_t[NumDevices]; SupportCooperativeGroups = new bool[NumDevices]; + DeviceNames = new std::string[NumDevices]; for (int d = 0; d < NumDevices; d++) { MaxSharedMemorySize[d] = 1e6; @@ -639,6 +641,7 @@ template <> class DeviceSpecification { MaxNumThreadsPerTB[d] = 1024; ArchitectureGeneration[d] = 1; SupportCooperativeGroups[d] = true; + DeviceNames[d] = "CPU"; } } @@ -673,6 +676,10 @@ template <> class DeviceSpecification { return SupportCooperativeGroups[dev_id]; } + MGARDX_CONT std::string GetDeviceName(int dev_id) { + return DeviceNames[dev_id]; + } + MGARDX_CONT ~DeviceSpecification() { delete[] MaxSharedMemorySize; @@ -683,6 +690,7 @@ template <> class DeviceSpecification { delete[] MaxNumThreadsPerTB; delete[] AvailableMemory; delete[] SupportCooperativeGroups; + delete[] DeviceNames; } int NumDevices; @@ -694,9 +702,10 @@ template <> class DeviceSpecification { int *MaxNumThreadsPerTB; size_t *AvailableMemory; bool *SupportCooperativeGroups; + std::string *DeviceNames; }; -template <> class DeviceQueues { +template <> class DeviceQueues { public: MGARDX_CONT DeviceQueues() { @@ -719,7 +728,7 @@ template <> class DeviceQueues { } }; -template <> class DeviceRuntime { +template <> class DeviceRuntime { public: MGARDX_CONT DeviceRuntime() {} @@ -744,6 +753,10 @@ template <> class DeviceRuntime { // do nothing } + MGARDX_CONT static std::string GetDeviceName() { + return DeviceSpecs.GetDeviceName(curr_dev_id); + } + MGARDX_CONT static int GetMaxSharedMemorySize() { return DeviceSpecs.GetMaxSharedMemorySize(curr_dev_id); } @@ -793,14 +806,14 @@ template <> class DeviceRuntime { ~DeviceRuntime() {} static int curr_dev_id; - static DeviceQueues queues; + static DeviceQueues queues; static bool SyncAllKernelsAndCheckErrors; - static DeviceSpecification DeviceSpecs; + static DeviceSpecification DeviceSpecs; static bool TimingAllKernels; static bool PrintKernelConfig; }; -template <> class MemoryManager { +template <> class MemoryManager { public: MGARDX_CONT MemoryManager(){}; @@ -811,7 +824,7 @@ template <> class MemoryManager { typename std::conditional::value, Byte, T>::type; ptr = (T *)std::malloc(n * sizeof(converted_T)); if (ptr == NULL) { - std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; + std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; } } @@ -823,7 +836,7 @@ template <> class MemoryManager { ptr = (T *)std::malloc(n1 * n2 * sizeof(converted_T)); ld = n1; if (ptr == NULL) { - std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; + std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; } } @@ -834,7 +847,7 @@ template <> class MemoryManager { ptr = (T *)std::malloc(n * sizeof(converted_T)); if (ptr == NULL) { std::cout << log::log_err - << "MemoryManager::MallocManaged1D error.\n"; + << "MemoryManager::MallocManaged1D error.\n"; } } @@ -866,7 +879,7 @@ template <> class MemoryManager { typename std::conditional::value, Byte, T>::type; ptr = (T *)std::malloc(n * sizeof(converted_T)); if (ptr == NULL) { - std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; + std::cout << log::log_err << "MemoryManager::Malloc1D error.\n"; } } @@ -940,7 +953,7 @@ typedef unsigned long long int uint64_cu; template struct BlockBitTranspose { + METHOD, SERIAL> { MGARDX_EXEC static void Serial_All(T_org *v, T_trans *tv, SIZE b, SIZE B, SIZE 
IdX, @@ -988,7 +1001,7 @@ template struct BlockErrorCollect { + METHOD, BinaryType, SERIAL> { MGARDX_EXEC static void Serial_All(T *v, T_error *temp, T_error *errors, SIZE num_elems, @@ -998,7 +1011,7 @@ struct BlockErrorCollect::binary2negabinary(fps_data); + T_fp ngb_data = Math::binary2negabinary(fps_data); T_error mantissa; if (BinaryType == BINARY) { mantissa = fabs(data) - fp_data; @@ -1012,7 +1025,7 @@ struct BlockErrorCollect::negabinary2binary(ngb_data & mask) + + diff = (T_error)Math::negabinary2binary(ngb_data & mask) + mantissa; } errors[num_bitplanes - bitplane_idx] += diff * diff; @@ -1032,56 +1045,81 @@ struct BlockErrorCollect class DeviceAdapter { +template class DeviceAdapter { public: MGARDX_CONT DeviceAdapter(){}; MGARDX_CONT - ExecutionReturn Execute(TaskTypeType &task) { + int IsResourceEnough(TaskType &task) { + if (task.GetBlockDimX() * task.GetBlockDimY() * task.GetBlockDimZ() > + DeviceRuntime::GetMaxNumThreadsPerTB()) { + return THREADBLOCK_TOO_LARGE; + } + if (task.GetSharedMemorySize() > + DeviceRuntime::GetMaxSharedMemorySize()) { + return SHARED_MEMORY_TOO_LARGE; + } + return RESOURCE_ENOUGH; + } + + MGARDX_CONT + ExecutionReturn Execute(TaskType &task) { - if (DeviceRuntime::PrintKernelConfig) { + if (DeviceRuntime::PrintKernelConfig) { std::cout << log::log_info << task.GetFunctorName() << ": <" << task.GetBlockDimX() << ", " << task.GetBlockDimY() << ", " << task.GetBlockDimZ() << "> <" << task.GetGridDimX() << ", " << task.GetGridDimY() << ", " << task.GetGridDimZ() << ">\n"; } + ExecutionReturn ret; + if (IsResourceEnough(task) != RESOURCE_ENOUGH) { + if (DeviceRuntime::PrintKernelConfig) { + if (IsResourceEnough(task) == THREADBLOCK_TOO_LARGE) { + std::cout << log::log_info << "threadblock too large.\n"; + } + if (IsResourceEnough(task) == SHARED_MEMORY_TOO_LARGE) { + std::cout << log::log_info << "shared memory too large.\n"; + } + } + ret.success = false; + ret.execution_time = std::numeric_limits::max(); + return ret; + } + Timer timer; - if (DeviceRuntime::TimingAllKernels || - AutoTuner::ProfileKernels) { - DeviceRuntime::SyncDevice(); + if (DeviceRuntime::TimingAllKernels || + AutoTuner::ProfileKernels) { + DeviceRuntime::SyncDevice(); timer.start(); } // if constexpr evalute at compile time otherwise this does not compile - if constexpr (std::is_base_of, - typename TaskTypeType::Functor>::value) { + if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { SerialKernel(task); - } else if constexpr (std::is_base_of< - IterFunctor, - typename TaskTypeType::Functor>::value) { + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { SerialIterKernel(task); - } else if constexpr (std::is_base_of< - HuffmanCLCustomizedFunctor, - typename TaskTypeType::Functor>::value) { + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { SerialHuffmanCLCustomizedKernel(task); - } else if constexpr (std::is_base_of< - HuffmanCWCustomizedFunctor, - typename TaskTypeType::Functor>::value) { + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { SerialHuffmanCWCustomizedKernel(task); } // timer.end(); // timer.print(task.GetFunctorName()); // timer.clear(); - ExecutionReturn ret; - if (DeviceRuntime::TimingAllKernels || - AutoTuner::ProfileKernels) { - DeviceRuntime::SyncDevice(); + if (DeviceRuntime::TimingAllKernels || + AutoTuner::ProfileKernels) { + DeviceRuntime::SyncDevice(); timer.end(); - if (DeviceRuntime::TimingAllKernels) { + if (DeviceRuntime::TimingAllKernels) { 
timer.print(task.GetFunctorName()); } - if (AutoTuner::ProfileKernels) { + if (AutoTuner::ProfileKernels) { + ret.success = true; ret.execution_time = timer.get(); } } @@ -1089,26 +1127,20 @@ template class DeviceAdapter { } }; -template struct KeyValueComparator { - bool operator()(std::pair a, std::pair b) const { - return a.first < b.first; - } -}; - -template <> class DeviceCollective { +template <> class DeviceCollective { public: MGARDX_CONT DeviceCollective(){}; template - MGARDX_CONT static void Sum(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, int queue_idx) { + MGARDX_CONT static void Sum(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { *result((IDX)0) = std::accumulate(v((IDX)0), v((IDX)n), 0); } template - MGARDX_CONT static void AbsMax(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, + MGARDX_CONT static void AbsMax(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { T max_result = 0; for (SIZE i = 0; i < n; ++i) { @@ -1118,8 +1150,8 @@ template <> class DeviceCollective { } template - MGARDX_CONT static void SquareSum(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, + MGARDX_CONT static void SquareSum(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { T sum_result = 0; for (SIZE i = 0; i < n; ++i) { @@ -1130,33 +1162,33 @@ template <> class DeviceCollective { } template - MGARDX_CONT static void ScanSumInclusive(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, + MGARDX_CONT static void ScanSumInclusive(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { // std::inclusive_scan(v(0), v(n), result(0)); - std::cout << log::log_err << "ScanSumInclusive not implemented.\n"; + std::cout << log::log_err << "ScanSumInclusive not implemented.\n"; } template - MGARDX_CONT static void ScanSumExclusive(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, + MGARDX_CONT static void ScanSumExclusive(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { // std::exclusive_scan(v(0), v(n), result(0)); - std::cout << log::log_err << "ScanSumExclusive not implemented.\n"; + std::cout << log::log_err << "ScanSumExclusive not implemented.\n"; } template - MGARDX_CONT static void ScanSumExtended(SIZE n, SubArray<1, T, Serial> &v, - SubArray<1, T, Serial> &result, + MGARDX_CONT static void ScanSumExtended(SIZE n, SubArray<1, T, SERIAL> &v, + SubArray<1, T, SERIAL> &result, int queue_idx) { // std::inclusive_scan(v(0), v(n), result(1)); // result(0) = 0; - std::cout << log::log_err << "ScanSumExtended not implemented.\n"; + std::cout << log::log_err << "ScanSumExtended not implemented.\n"; } template - MGARDX_CONT static void SortByKey(SIZE n, SubArray<1, KeyT, Serial> &keys, - SubArray<1, ValueT, Serial> &values, + MGARDX_CONT static void SortByKey(SIZE n, SubArray<1, KeyT, SERIAL> &keys, + SubArray<1, ValueT, SERIAL> &values, int queue_idx) { std::vector> data(n); for (SIZE i = 0; i < n; ++i) { diff --git a/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.h b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.h new file mode 100644 index 0000000000..e39ea274e0 --- /dev/null +++ b/include/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.h @@ -0,0 +1,1151 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
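The asynchronous exception_handler defined in this new adapter only runs when the owning queue is waited on. A minimal standalone sketch of that mechanism, assuming a SYCL 2020 toolchain (e.g. icpx -fsycl) — note the handler in the patch terminates the process, while this illustrative variant only logs:

#include <sycl/sycl.hpp>
#include <iostream>

int main() {
  // Logging counterpart of the adapter's exception_handler.
  auto log_handler = [](sycl::exception_list errs) {
    for (const std::exception_ptr &e : errs) {
      try {
        std::rethrow_exception(e);
      } catch (const sycl::exception &ex) {
        std::cout << "async SYCL error: " << ex.what() << "\n";
      }
    }
  };
  sycl::queue q{sycl::default_selector_v, log_handler};
  q.submit([&](sycl::handler &h) {
    h.parallel_for(sycl::range<1>{64}, [=](sycl::id<1>) { /* no-op */ });
  });
  q.wait_and_throw(); // asynchronous exceptions are delivered here
  return 0;
}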
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "DeviceAdapter.h" +#include + +#ifndef MGARD_X_DEVICE_ADAPTER_SYCL_H +#define MGARD_X_DEVICE_ADAPTER_SYCL_H + +namespace mgard_x { + +using LocalMemory = sycl::accessor; + +// Create an exception sycl::handler for asynchronous SYCL exceptions +static auto exception_handler = [](sycl::exception_list e_list) { + for (std::exception_ptr const &e : e_list) { + try { + std::rethrow_exception(e); + } catch (std::exception const &e) { + std::cout << "Failure" << std::endl; + std::terminate(); + } + } +}; + +template +struct Atomic { + MGARDX_EXEC static T Min(T *result, T value) { + if constexpr (MemoryType == AtomicGlobalMemory) { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_min(value); + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_min(value); + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_min(value); + } + } else { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_min(value); + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_min(value); + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_min(value); + } + } + } + MGARDX_EXEC static T Max(T *result, T value) { + if constexpr (MemoryType == AtomicGlobalMemory) { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_max(value); + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_max(value); + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::global_space>; + return AtomicRef(result[0]).fetch_max(value); + } + } else { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_max(value); + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, 
sycl::memory_scope::device, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_max(value); + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::local_space>; + return AtomicRef(result[0]).fetch_max(value); + } + } + } + MGARDX_EXEC static T Add(T *result, T value) { + if constexpr (MemoryType == AtomicGlobalMemory) { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::global_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::global_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::global_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } + } else { + if constexpr (Scope == AtomicSystemScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::system, + sycl::access::address_space::local_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } else if constexpr (Scope == AtomicDeviceScope) { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::device, + sycl::access::address_space::local_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } else { + using AtomicRef = sycl::ext::oneapi::atomic_ref< + T, sycl::memory_order::relaxed, sycl::memory_scope::work_group, + sycl::access::address_space::local_space>; + T new_value = AtomicRef(result[0]) += value; + return new_value - value; + } + } + } +}; + +template <> struct Math { + template MGARDX_EXEC static T Min(T a, T b) { + if constexpr (std::is_integral::value) + return sycl::min(a, b); + else { + return sycl::fmin(a, b); + } + } + template MGARDX_EXEC static T Max(T a, T b) { + if constexpr (std::is_integral::value) + return sycl::max(a, b); + else { + return sycl::fmax(a, b); + } + } + MGARDX_EXEC static int ffs(unsigned int a) { + int pos = 0; + if (a == 0) + return pos; + while (!(a & 1)) { + a >>= 1; + ++pos; + } + return pos + 1; + } + MGARDX_EXEC static int ffsll(long long unsigned int a) { + int pos = 0; + if (a == 0) + return pos; + while (!(a & 1)) { + a >>= 1; + ++pos; + } + return pos + 1; + } + MGARDX_EXEC + static uint64_t binary2negabinary(const int64_t x) { + return (x + (uint64_t)0xaaaaaaaaaaaaaaaaull) ^ + (uint64_t)0xaaaaaaaaaaaaaaaaull; + } + + MGARDX_EXEC + static uint32_t binary2negabinary(const int32_t x) { + return (x + (uint32_t)0xaaaaaaaau) ^ (uint32_t)0xaaaaaaaau; + } + + MGARDX_EXEC + static int64_t negabinary2binary(const uint64_t x) { + return (x ^ 0xaaaaaaaaaaaaaaaaull) - 0xaaaaaaaaaaaaaaaaull; + } + + MGARDX_EXEC + static int32_t negabinary2binary(const uint32_t x) { + return (x ^ 0xaaaaaaaau) - 0xaaaaaaaau; + } +}; + +template <> class DeviceSpecification { +public: + MGARDX_CONT + DeviceSpecification() { + sycl::default_selector d_selector; + sycl::platform d_platform(d_selector); + std::vector d_devices = 
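A note on the Add overloads above: operator+= on an atomic_ref returns the new value, so the code returns new_value - value to recover the pre-add value, matching CUDA's atomicAdd convention. With standard SYCL 2020 sycl::atomic_ref, fetch_add already returns the old value directly; an equivalent sketch, assuming device scope and global address space:

#include <sycl/sycl.hpp>

template <typename T>
T AddReturningOld(T *result, T value) {
  sycl::atomic_ref<T, sycl::memory_order::relaxed,
                   sycl::memory_scope::device,
                   sycl::access::address_space::global_space>
      ref(result[0]);
  return ref.fetch_add(value); // returns the value held before the add
}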
d_platform.get_devices(); + NumDevices = d_devices.size(); + MaxSharedMemorySize = new int[NumDevices]; + WarpSize = new int[NumDevices]; + NumSMs = new int[NumDevices]; + ArchitectureGeneration = new int[NumDevices]; + MaxNumThreadsPerSM = new int[NumDevices]; + MaxNumThreadsPerTB = new int[NumDevices]; + AvailableMemory = new size_t[NumDevices]; + SupportCooperativeGroups = new bool[NumDevices]; + DeviceNames = new std::string[NumDevices]; + + int d = 0; + for (auto &device : d_devices) { + MaxSharedMemorySize[d] = + device.get_info(); + WarpSize[d] = 32; + NumSMs[d] = device.get_info(); + MaxNumThreadsPerSM[d] = + device.get_info(); + ; + // Larger limit can cause resource insufficient error + MaxNumThreadsPerTB[d] = std::min( + 1024ul, device.get_info()); + ; + AvailableMemory[d] = + device.get_info(); + SupportCooperativeGroups[d] = false; + DeviceNames[d] = std::string(device.get_info()); + d++; + } + } + + MGARDX_CONT int GetNumDevices() { return NumDevices; } + + MGARDX_CONT int GetMaxSharedMemorySize(int dev_id) { + return MaxSharedMemorySize[dev_id]; + } + + MGARDX_CONT int GetWarpSize(int dev_id) { return WarpSize[dev_id]; } + + MGARDX_CONT int GetNumSMs(int dev_id) { return NumSMs[dev_id]; } + + MGARDX_CONT int GetArchitectureGeneration(int dev_id) { + return ArchitectureGeneration[dev_id]; + } + + MGARDX_CONT int GetMaxNumThreadsPerSM(int dev_id) { + return MaxNumThreadsPerSM[dev_id]; + } + + MGARDX_CONT int GetMaxNumThreadsPerTB(int dev_id) { + return MaxNumThreadsPerTB[dev_id]; + } + + MGARDX_CONT size_t GetAvailableMemory(int dev_id) { + return AvailableMemory[dev_id]; + } + + MGARDX_CONT bool SupportCG(int dev_id) { + return SupportCooperativeGroups[dev_id]; + } + + MGARDX_CONT std::string GetDeviceName(int dev_id) { + return DeviceNames[dev_id]; + } + + MGARDX_CONT + ~DeviceSpecification() { + delete[] MaxSharedMemorySize; + delete[] WarpSize; + delete[] NumSMs; + delete[] ArchitectureGeneration; + delete[] MaxNumThreadsPerSM; + delete[] MaxNumThreadsPerTB; + delete[] AvailableMemory; + delete[] SupportCooperativeGroups; + delete[] DeviceNames; + } + + int NumDevices; + int *MaxSharedMemorySize; + int *WarpSize; + int *NumSMs; + int *ArchitectureGeneration; + int *MaxNumThreadsPerSM; + int *MaxNumThreadsPerTB; + size_t *AvailableMemory; + bool *SupportCooperativeGroups; + std::string *DeviceNames; +}; + +template <> class DeviceQueues { +public: + MGARDX_CONT + DeviceQueues() { + sycl::default_selector d_selector; + sycl::platform d_platform(d_selector); + std::vector d_devices = d_platform.get_devices(); + NumDevices = d_devices.size(); + queues = new sycl::queue *[NumDevices]; + for (SIZE d = 0; d < NumDevices; d++) { + queues[d] = new sycl::queue[MGARDX_NUM_QUEUES]; + for (SIZE i = 0; i < MGARDX_NUM_QUEUES; i++) { + queues[d][i] = sycl::queue(d_devices[d], exception_handler); + } + } + } + + MGARDX_CONT sycl::queue GetQueue(int dev_id, SIZE queue_id) { + return queues[dev_id][queue_id]; + } + + MGARDX_CONT void SyncQueue(int dev_id, SIZE queue_id) { + queues[dev_id][queue_id].wait(); + } + + MGARDX_CONT void SyncAllQueues(int dev_id) { + for (SIZE i = 0; i < MGARDX_NUM_QUEUES; i++) { + queues[dev_id][i].wait(); + } + } + + MGARDX_CONT + ~DeviceQueues() { + for (SIZE d = 0; d < NumDevices; d++) { + delete[] queues[d]; + } + delete[] queues; + queues = NULL; + } + + int NumDevices; + sycl::queue **queues = NULL; +}; + +template <> class DeviceRuntime { +public: + MGARDX_CONT + DeviceRuntime() {} + + MGARDX_CONT static int GetDeviceCount() { return DeviceSpecs.NumDevices; 
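The get_info calls in the DeviceSpecification constructor above lost their template arguments in this diff rendering; the SYCL 2020 descriptors that match the fields being filled are presumably the following (illustrative only):

#include <sycl/sycl.hpp>
#include <string>

void QueryOneDevice(const sycl::device &dev) {
  size_t local_mem   = dev.get_info<sycl::info::device::local_mem_size>();
  unsigned int cus   = dev.get_info<sycl::info::device::max_compute_units>();
  size_t max_wg_size = dev.get_info<sycl::info::device::max_work_group_size>();
  size_t global_mem  = dev.get_info<sycl::info::device::global_mem_size>();
  std::string name   = dev.get_info<sycl::info::device::name>();
  (void)local_mem; (void)cus; (void)max_wg_size; (void)global_mem; (void)name;
}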
} + + MGARDX_CONT static void SelectDevice(SIZE dev_id) { curr_dev_id = dev_id; } + + MGARDX_CONT static sycl::queue GetQueue(SIZE queue_id) { + return queues.GetQueue(curr_dev_id, queue_id); + } + + MGARDX_CONT static void SyncQueue(SIZE queue_id) { + queues.SyncQueue(curr_dev_id, queue_id); + } + + MGARDX_CONT static void SyncAllQueues() { queues.SyncAllQueues(curr_dev_id); } + + MGARDX_CONT static void SyncDevice() { queues.SyncAllQueues(curr_dev_id); } + + MGARDX_CONT static std::string GetDeviceName() { + return DeviceSpecs.GetDeviceName(curr_dev_id); + } + + MGARDX_CONT static int GetMaxSharedMemorySize() { + return DeviceSpecs.GetMaxSharedMemorySize(curr_dev_id); + } + + MGARDX_CONT static int GetWarpSize() { + return DeviceSpecs.GetWarpSize(curr_dev_id); + } + + MGARDX_CONT static int GetNumSMs() { + return DeviceSpecs.GetNumSMs(curr_dev_id); + } + + MGARDX_CONT static int GetArchitectureGeneration() { + return DeviceSpecs.GetArchitectureGeneration(curr_dev_id); + } + + MGARDX_CONT static int GetMaxNumThreadsPerSM() { + return DeviceSpecs.GetMaxNumThreadsPerSM(curr_dev_id); + } + + MGARDX_CONT static int GetMaxNumThreadsPerTB() { + return DeviceSpecs.GetMaxNumThreadsPerTB(curr_dev_id); + } + + MGARDX_CONT static size_t GetAvailableMemory() { + return DeviceSpecs.GetAvailableMemory(curr_dev_id); + } + + MGARDX_CONT static bool SupportCG() { + return DeviceSpecs.SupportCG(curr_dev_id); + } + + template + MGARDX_CONT static int + GetOccupancyMaxActiveBlocksPerSM(FunctorType functor, int blockSize, + size_t dynamicSMemSize) { + return 32; + } + + template + MGARDX_CONT static void SetMaxDynamicSharedMemorySize(FunctorType functor, + int maxbytes) { + // do nothing + } + + MGARDX_CONT + ~DeviceRuntime() {} + + static int curr_dev_id; + static DeviceQueues queues; + static bool SyncAllKernelsAndCheckErrors; + static DeviceSpecification DeviceSpecs; + static bool TimingAllKernels; + static bool PrintKernelConfig; +}; + +template <> class MemoryManager { +public: + MGARDX_CONT + MemoryManager(){}; + + template + MGARDX_CONT static void Malloc1D(T *&ptr, SIZE n, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + ptr = malloc_device(n, q); + } + + template + MGARDX_CONT static void MallocND(T *&ptr, SIZE n1, SIZE n2, SIZE &ld, + int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + ptr = malloc_device(n1 * n2, q); + ld = n1; + } + + template + MGARDX_CONT static void MallocManaged1D(T *&ptr, SIZE n, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + ptr = malloc_shared(n, q); + } + + template MGARDX_CONT static void Free(T *ptr) { + // printf("MemoryManager.Free(%llu)\n", ptr); + sycl::queue q = DeviceRuntime::GetQueue(0); + if (ptr == NULL) + return; + sycl::free(ptr, q); + } + + template + MGARDX_CONT static void Copy1D(T *dst_ptr, const T *src_ptr, SIZE n, + int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + q.memcpy((converted_T *)dst_ptr, (converted_T *)src_ptr, + n * sizeof(converted_T)); + } + + template + MGARDX_CONT static void CopyND(T *dst_ptr, SIZE dst_ld, const T *src_ptr, + SIZE src_ld, SIZE n1, SIZE n2, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + 
typename std::conditional::value, Byte, T>::type; + q.memcpy((converted_T *)dst_ptr, (converted_T *)src_ptr, + n1 * n2 * sizeof(converted_T)); + } + + template + MGARDX_CONT static void MallocHost(T *&ptr, SIZE n, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + ptr = malloc_host(n, q); + } + + template MGARDX_CONT static void FreeHost(T *ptr) { + sycl::queue q = DeviceRuntime::GetQueue(0); + if (ptr == NULL) + return; + sycl::free(ptr, q); + } + + template + MGARDX_CONT static void Memset1D(T *ptr, SIZE n, int value, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + q.memset((converted_T *)ptr, value, n * sizeof(converted_T)); + } + + template + MGARDX_CONT static void MemsetND(T *ptr, SIZE ld, SIZE n1, SIZE n2, int value, + int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + using converted_T = + typename std::conditional::value, Byte, T>::type; + q.memset((converted_T *)ptr, value, n1 * n2 * sizeof(converted_T)); + } + + template MGARDX_CONT static bool IsDevicePointer(T *ptr) { + sycl::queue q = DeviceRuntime::GetQueue(0); + return sycl::get_pointer_type(ptr, q.get_context()) == + sycl::usm::alloc::device; + } + + static bool ReduceMemoryFootprint; +}; + +template class Kernel { +public: + Kernel(FunctorType functor, LocalMemory localAccess) + : functor(functor), localAccess(localAccess) {} + void operator()(sycl::nd_item<3> i) const { + FunctorType my_functor = functor; + sycl::local_ptr l_ptr = localAccess.get_pointer(); + Byte *shared_memory = l_ptr.get(); + my_functor.Init( + i.get_group_range(2), i.get_group_range(1), i.get_group_range(0), + i.get_global_range(2) / i.get_group_range(2), + i.get_global_range(1) / i.get_group_range(1), + i.get_global_range(0) / i.get_group_range(0), i.get_group().get_id(2), + i.get_group().get_id(1), i.get_group().get_id(0), i.get_local_id(2), + i.get_local_id(1), i.get_local_id(0), shared_memory); + + my_functor.Operation1(); + i.barrier(); + my_functor.Operation2(); + i.barrier(); + my_functor.Operation3(); + i.barrier(); + my_functor.Operation4(); + i.barrier(); + my_functor.Operation5(); + i.barrier(); + my_functor.Operation6(); + i.barrier(); + my_functor.Operation7(); + i.barrier(); + my_functor.Operation8(); + i.barrier(); + my_functor.Operation9(); + i.barrier(); + my_functor.Operation10(); + } + +private: + FunctorType functor; + LocalMemory localAccess; +}; + +template class IterKernel { +public: + IterKernel(FunctorType functor, LocalMemory localAccess) + : functor(functor), localAccess(localAccess) {} + void operator()(sycl::nd_item<3> i) const { + FunctorType my_functor = functor; + Byte *shared_memory = localAccess.get_pointer().get(); + my_functor.Init( + i.get_group_range(2), i.get_group_range(1), i.get_group_range(0), + i.get_global_range(2) / i.get_group_range(2), + i.get_global_range(1) / i.get_group_range(1), + i.get_global_range(0) / i.get_group_range(0), i.get_group().get_id(2), + i.get_group().get_id(1), i.get_group().get_id(0), i.get_local_id(2), + i.get_local_id(1), i.get_local_id(0), shared_memory); + + my_functor.Operation1(); + i.barrier(); + + my_functor.Operation2(); + i.barrier(); + + while (my_functor.LoopCondition1()) { + my_functor.Operation3(); + i.barrier(); + my_functor.Operation4(); + i.barrier(); + my_functor.Operation5(); + i.barrier(); + my_functor.Operation6(); + i.barrier(); + } + + 
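On the Init(...) calls in Kernel and IterKernel above: SYCL orders nd_item dimensions with index 0 slowest-varying, the reverse of CUDA's threadIdx.x, which is why dimension 2 is passed where a CUDA backend would pass the X index. A small mapping helper under that assumption (illustrative, not part of the patch):

#include <sycl/sycl.hpp>

struct ThreadIdx3 {
  size_t x, y, z;
};

inline ThreadIdx3 CudaStyleLocalId(const sycl::nd_item<3> &it) {
  return ThreadIdx3{it.get_local_id(2),  // fastest-varying -> x
                    it.get_local_id(1),  // -> y
                    it.get_local_id(0)}; // slowest-varying -> z
}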
my_functor.Operation7(); + i.barrier(); + my_functor.Operation8(); + i.barrier(); + my_functor.Operation9(); + i.barrier(); + my_functor.Operation10(); + i.barrier(); + + while (my_functor.LoopCondition2()) { + my_functor.Operation11(); + i.barrier(); + my_functor.Operation12(); + i.barrier(); + my_functor.Operation13(); + i.barrier(); + my_functor.Operation14(); + i.barrier(); + } + + my_functor.Operation15(); + i.barrier(); + my_functor.Operation16(); + i.barrier(); + my_functor.Operation17(); + i.barrier(); + } + +private: + FunctorType functor; + LocalMemory localAccess; +}; + +#define SINGLE_KERNEL(OPERATION) \ + template class Single_##OPERATION##_Kernel { \ + public: \ + Single_##OPERATION##_Kernel(FunctorType functor, LocalMemory localAccess) \ + : functor(functor), localAccess(localAccess) {} \ + void operator()(sycl::nd_item<3> i) const { \ + FunctorType my_functor = functor; \ + Byte *shared_memory = localAccess.get_pointer().get(); \ + my_functor.Init(i.get_group_range(2), i.get_group_range(1), \ + i.get_group_range(0), \ + i.get_global_range(2) / i.get_group_range(2), \ + i.get_global_range(1) / i.get_group_range(1), \ + i.get_global_range(0) / i.get_group_range(0), \ + i.get_group().get_id(2), i.get_group().get_id(1), \ + i.get_group().get_id(0), i.get_local_id(2), \ + i.get_local_id(1), i.get_local_id(0), shared_memory); \ + my_functor.OPERATION(); \ + i.barrier(); \ + } \ + \ + private: \ + FunctorType functor; \ + LocalMemory localAccess; \ + }; + +SINGLE_KERNEL(Operation1); +SINGLE_KERNEL(Operation2); +SINGLE_KERNEL(Operation3); +SINGLE_KERNEL(Operation4); +SINGLE_KERNEL(Operation5); +SINGLE_KERNEL(Operation6); +SINGLE_KERNEL(Operation7); +SINGLE_KERNEL(Operation8); +SINGLE_KERNEL(Operation9); +SINGLE_KERNEL(Operation10); +SINGLE_KERNEL(Operation11); +SINGLE_KERNEL(Operation12); +SINGLE_KERNEL(Operation13); +SINGLE_KERNEL(Operation14); + +#undef SINGLE_KERNEL + +template class ParallelMergeKernel { +public: + ParallelMergeKernel(FunctorType functor, LocalMemory localAccess) + : functor(functor), localAccess(localAccess) {} + void operator()(sycl::nd_item<3> i) const { + FunctorType my_functor = functor; + Byte *shared_memory = localAccess.get_pointer().get(); + my_functor.Init( + i.get_group_range(2), i.get_group_range(1), i.get_group_range(0), + i.get_global_range(2) / i.get_group_range(2), + i.get_global_range(1) / i.get_group_range(1), + i.get_global_range(0) / i.get_group_range(0), i.get_group().get_id(2), + i.get_group().get_id(1), i.get_group().get_id(0), i.get_local_id(2), + i.get_local_id(1), i.get_local_id(0), shared_memory); + + my_functor.Operation5(); + i.barrier(); + while (my_functor.LoopCondition2()) { + my_functor.Operation6(); + i.barrier(); + my_functor.Operation7(); + i.barrier(); + my_functor.Operation8(); + i.barrier(); + } + my_functor.Operation9(); + } + +private: + FunctorType functor; + LocalMemory localAccess; +}; + +template void HuffmanCLCustomizedNoCGKernel(Task task) { + // std::cout << "calling HuffmanCLCustomizedNoCGKernel\n"; + sycl::range global_threads(task.GetBlockDimX() * task.GetGridDimX(), + task.GetBlockDimY() * task.GetGridDimY(), + task.GetBlockDimZ() * task.GetGridDimZ()); + + sycl::range local_threads(task.GetBlockDimX(), task.GetBlockDimY(), + task.GetBlockDimZ()); + + size_t sm_size = task.GetSharedMemorySize(); + if (sm_size == 0) + sm_size = 1; // avoid -51 (CL_INVALID_ARG_SIZE) error + + sycl::queue q = DeviceRuntime::GetQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation1_Kernel\n"; + 
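The launch sequence that follows replaces grid-wide cooperative groups (unsupported here, per SupportCooperativeGroups[d] = false above) with one kernel per Operation phase and a queue wait between phases, so host-side conditions such as LoopCondition1() observe up-to-date functor state. Every submission below has the same shape; a generic single-phase launcher distilling it, assuming the LocalMemory accessor alias from the top of this file:

// Hypothetical helper summarizing the repeated submit/wait pattern below.
template <typename KernelT, typename TaskT>
void LaunchPhase(sycl::queue &q, TaskT &task, size_t sm_size,
                 sycl::range<3> global, sycl::range<3> local) {
  q.submit([&](sycl::handler &h) {
    LocalMemory localAccess{sm_size, h}; // shared memory as a local accessor
    KernelT kernel(task.GetFunctor(), localAccess);
    h.parallel_for(sycl::nd_range<3>{global, local}, kernel);
  });
  q.wait(); // host must observe this phase before evaluating the next
}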
q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation1_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling LoopCondition1\n"; + while (task.GetFunctor().LoopCondition1()) { + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation2_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation2_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation3_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation3_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation4_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation4_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling BranchCondition1\n"; + if (task.GetFunctor().BranchCondition1()) { + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling ParallelMergeKernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + ParallelMergeKernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation10_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation10_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + } + + // std::cout << "calling Single_Operation11_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation11_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation12_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation12_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation13_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation13_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + // std::cout << "calling Single_Operation14_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation14_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + } +} + +template void HuffmanCWCustomizedNoCGKernel(Task task) 
{ + // std::cout << "calling HuffmanCWCustomizedNoCGKernel\n"; + sycl::range global_threads(task.GetBlockDimX() * task.GetGridDimX(), + task.GetBlockDimY() * task.GetGridDimY(), + task.GetBlockDimZ() * task.GetGridDimZ()); + + sycl::range local_threads(task.GetBlockDimX(), task.GetBlockDimY(), + task.GetBlockDimZ()); + + size_t sm_size = task.GetSharedMemorySize(); + if (sm_size == 0) + sm_size = 1; // avoid -51 (CL_INVALID_ARG_SIZE) error + + sycl::queue q = DeviceRuntime::GetQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation1_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation1_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + // std::cout << "calling Single_Operation2_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation2_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + // std::cout << "calling Single_Operation3_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation3_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling LoopCondition1\n"; + while (task.GetFunctor().LoopCondition1()) { + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation4_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation4_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation5_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation5_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation6_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation6_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation7_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation7_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation8_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation8_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + } + + // std::cout << "calling Single_Operation9_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation9_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + 
DeviceRuntime::SyncQueue(task.GetQueueIdx()); + + // std::cout << "calling Single_Operation10_Kernel\n"; + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Single_Operation10_Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + DeviceRuntime::SyncQueue(task.GetQueueIdx()); +} + +// template class DeviceAdapter { +public: + // inline constexpr bool sycl::is_device_copyable_v = true; + + MGARDX_CONT + DeviceAdapter(){}; + + MGARDX_CONT + int IsResourceEnough(TaskType &task) { + if (task.GetBlockDimX() * task.GetBlockDimY() * task.GetBlockDimZ() > + DeviceRuntime::GetMaxNumThreadsPerTB()) { + return THREADBLOCK_TOO_LARGE; + } + if (task.GetSharedMemorySize() > + DeviceRuntime::GetMaxSharedMemorySize()) { + return SHARED_MEMORY_TOO_LARGE; + } + return RESOURCE_ENOUGH; + } + + MGARDX_CONT + ExecutionReturn Execute(TaskType &task) { + + sycl::range global_threads(task.GetBlockDimX() * task.GetGridDimX(), + task.GetBlockDimY() * task.GetGridDimY(), + task.GetBlockDimZ() * task.GetGridDimZ()); + + sycl::range local_threads(task.GetBlockDimX(), task.GetBlockDimY(), + task.GetBlockDimZ()); + + size_t sm_size = task.GetSharedMemorySize(); + if (sm_size == 0) + sm_size = 1; // avoid -51 (CL_INVALID_ARG_SIZE) error + + sycl::queue q = DeviceRuntime::GetQueue(task.GetQueueIdx()); + + if (DeviceRuntime::PrintKernelConfig) { + std::cout << log::log_info << task.GetFunctorName() << ": <" + << task.GetBlockDimX() << ", " << task.GetBlockDimY() << ", " + << task.GetBlockDimZ() << "> <" << task.GetGridDimX() << ", " + << task.GetGridDimY() << ", " << task.GetGridDimZ() << ">\n"; + } + + ExecutionReturn ret; + if (IsResourceEnough(task) != RESOURCE_ENOUGH) { + if (DeviceRuntime::PrintKernelConfig) { + if (IsResourceEnough(task) == THREADBLOCK_TOO_LARGE) { + std::cout << log::log_info << "threadblock too large.\n"; + } + if (IsResourceEnough(task) == SHARED_MEMORY_TOO_LARGE) { + std::cout << log::log_info << "shared memory too large.\n"; + } + } + ret.success = false; + ret.execution_time = std::numeric_limits::max(); + return ret; + } + + Timer timer; + if (DeviceRuntime::TimingAllKernels || + AutoTuner::ProfileKernels) { + DeviceRuntime::SyncDevice(); + timer.start(); + } + + // if constexpr evaluate at compile time otherwise this does not compile + if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + Kernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { + q.submit([&](sycl::handler &h) { + LocalMemory localAccess{sm_size, h}; + IterKernel kernel(task.GetFunctor(), localAccess); + h.parallel_for(sycl::nd_range{global_threads, local_threads}, kernel); + }); + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { + HuffmanCLCustomizedNoCGKernel(task); + } else if constexpr (std::is_base_of, + typename TaskType::Functor>::value) { + HuffmanCWCustomizedNoCGKernel(task); + } + if (DeviceRuntime::SyncAllKernelsAndCheckErrors) { + DeviceRuntime::SyncQueue(task.GetQueueIdx()); + } + + if (DeviceRuntime::TimingAllKernels || + AutoTuner::ProfileKernels) { + DeviceRuntime::SyncDevice(); + timer.end(); + if (DeviceRuntime::TimingAllKernels) { + timer.print(task.GetFunctorName()); + } + if (AutoTuner::ProfileKernels) { + ret.success = true; + 
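A caveat on the DeviceCollective reductions defined just below: sycl::nd_range{global, local} requires the global size to be a multiple of the work-group size, and the kernels read input[i] unguarded, so n is implicitly assumed to be a multiple of 256 (or 4 for AbsMax). A size-safe variant would pad the range and bounds-check inside the kernel (illustrative sketch, not part of the patch):

#include <sycl/sycl.hpp>

template <typename T>
void SafeSum(sycl::queue &q, const T *input, T *res, size_t n) {
  constexpr size_t WG = 256;
  const size_t padded = ((n + WG - 1) / WG) * WG; // round up to a WG multiple
  q.submit([&](sycl::handler &h) {
    h.parallel_for(sycl::nd_range<1>{padded, WG},
                   sycl::reduction(res, T(0), sycl::plus<T>()),
                   [=](sycl::nd_item<1> it, auto &sum) {
                     const size_t i = it.get_global_id(0);
                     if (i < n) sum.combine(input[i]); // mask the padding
                   });
  });
  q.wait();
}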
ret.execution_time = timer.get(); + } + } + return ret; + } +}; + +template struct AbsMaxOp { + T operator()(const T &a, const T &b) const { + return (fabs(b) > fabs(a)) ? fabs(b) : fabs(a); + } +}; + +template struct SquareOp { + T operator()(const T &a) const { return a * a; } +}; + +template <> class DeviceCollective { +public: + MGARDX_CONT + DeviceCollective(){}; + + template + MGARDX_CONT static void Sum(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, int queue_idx) { + + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + q.submit([&](sycl::handler &h) { + T *res = result.data(); + T *input = v.data(); + sycl::range global{n}; + sycl::range local{256}; + h.parallel_for(sycl::nd_range{global, local}, + sycl::reduction(res, (T)0, sycl::plus()), + [=](sycl::nd_item<1> it, auto &res) { + size_t i = it.get_global_id(0); + res.combine(input[i]); + }); + }); + DeviceRuntime::SyncDevice(); + } + + template + MGARDX_CONT static void AbsMax(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + q.submit([&](sycl::handler &h) { + T *res = result.data(); + T *input = v.data(); + sycl::range global{n}; + sycl::range local{4}; + h.parallel_for(sycl::nd_range{global, local}, + sycl::reduction(res, (T)0, AbsMaxOp()), + [=](sycl::nd_item<1> it, auto &res) { + size_t i = it.get_global_id(0); + res.combine(input[i]); + }); + }); + DeviceRuntime::SyncDevice(); + } + + template + MGARDX_CONT static void SquareSum(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, + int queue_idx) { + sycl::queue q = DeviceRuntime::GetQueue(queue_idx); + q.submit([&](sycl::handler &h) { + T *res = result.data(); + T *input = v.data(); + sycl::range global{n}; + sycl::range local{256}; + h.parallel_for(sycl::nd_range{global, local}, + sycl::reduction(res, (T)0, sycl::plus()), + [=](sycl::nd_item<1> it, auto &res) { + size_t i = it.get_global_id(0); + res.combine(input[i] * input[i]); + }); + }); + DeviceRuntime::SyncDevice(); + } + + template + MGARDX_CONT static void ScanSumInclusive(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, + int queue_idx) {} + + template + MGARDX_CONT static void ScanSumExclusive(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, + int queue_idx) {} + + template + MGARDX_CONT static void ScanSumExtended(SIZE n, SubArray<1, T, SYCL> &v, + SubArray<1, T, SYCL> &result, + int queue_idx) {} + + template + MGARDX_CONT static void SortByKey(SIZE n, SubArray<1, KeyT, SYCL> &keys, + SubArray<1, ValueT, SYCL> &values, + int queue_idx) { + KeyT *keys_array = new KeyT[n]; + ValueT *values_array = new ValueT[n]; + MemoryManager::Copy1D(keys_array, keys.data(), n, 0); + MemoryManager::Copy1D(values_array, values.data(), n, 0); + DeviceRuntime::SyncQueue(0); + + std::vector> data(n); + for (SIZE i = 0; i < n; ++i) { + data[i] = std::pair(keys_array[i], values_array[i]); + } + std::stable_sort(data.begin(), data.end(), + KeyValueComparator{}); + for (SIZE i = 0; i < n; ++i) { + keys_array[i] = data[i].first; + values_array[i] = data[i].second; + } + MemoryManager::Copy1D(keys.data(), keys_array, n, 0); + MemoryManager::Copy1D(values.data(), values_array, n, 0); + DeviceRuntime::SyncDevice(); + delete[] keys_array; + delete[] values_array; + } + + template + MGARDX_CONT static void + ScanOpInclusiveByKey(SubArray<1, SIZE, SYCL> &key, + SubArray<1, ValueT, SYCL> &v, + SubArray<1, ValueT, SYCL> &result, int queue_idx) {} +}; + +} // namespace mgard_x +#endif \ No 
newline at end of file diff --git a/include/mgard-x/RuntimeX/RuntimeX.h b/include/mgard-x/RuntimeX/RuntimeX.h index e22f930c42..e0b575145b 100644 --- a/include/mgard-x/RuntimeX/RuntimeX.h +++ b/include/mgard-x/RuntimeX/RuntimeX.h @@ -14,11 +14,8 @@ #include "AutoTuners/AutoTuner.h" #include "Tasks/Task.h" -#if MGARD_ENABLE_SERIAL -#ifdef MGARDX_COMPILE_SERIAL +// Serial backend should be always available #include "DeviceAdapters/DeviceAdapterSerial.h" -#endif -#endif #if MGARD_ENABLE_CUDA #ifdef MGARDX_COMPILE_CUDA @@ -32,6 +29,12 @@ #endif #endif +#if MGARD_ENABLE_SYCL +#ifdef MGARDX_COMPILE_SYCL +#include "DeviceAdapters/DeviceAdapterSycl.h" +#endif +#endif + #if RUNTIME_X_ENABLE_KOKKOS #include "DeviceAdapters/DeviceAdapterKokkos.h" #endif diff --git a/include/mgard-x/RuntimeX/Utilities/SubArrayPrinter.hpp b/include/mgard-x/RuntimeX/Utilities/SubArrayPrinter.hpp index e4c5717260..02e42a4456 100644 --- a/include/mgard-x/RuntimeX/Utilities/SubArrayPrinter.hpp +++ b/include/mgard-x/RuntimeX/Utilities/SubArrayPrinter.hpp @@ -53,6 +53,7 @@ void PrintSubarray(std::string name, SubArrayType subArray) { // ncol, subArray.data(), subArray.lddv1 * sizeof(T), // nfib * sizeof(T), subArray.lddv2, nfib * sizeof(T), // ncol, nrow, D2H, 0); + DeviceRuntime::SyncQueue(0); for (SIZE i = 0; i < nrow; i++) { MemoryManager::CopyND( v + ncol * nfib * i, nfib, @@ -251,6 +252,48 @@ void CompareSubarray(std::string name, SubArrayType1 subArray1, delete[] v2; } +template +void CompareSubarray4D(SubArrayType subArray1, SubArrayType subArray2) { + if (SubArrayType::NumDims != 4) { + std::cout << log::log_err + << "CompareSubarray4D expects 4D subarray type.\n"; + exit(-1); + } + if (subArray1.getShape(3) != subArray2.getShape(3)) { + std::cout << log::log_err << "CompareSubarray4D mismatch 4D size.\n"; + exit(-1); + } + + using T = typename SubArrayType::DataType; + SIZE idx[4] = {0, 0, 0, 0}; + for (SIZE i = 0; i < subArray1.getShape(3); i++) { + idx[3] = i; + SubArrayType temp1 = subArray1; + SubArrayType temp2 = subArray2; + temp1.offset(3, i); + temp2.offset(3, i); + CompareSubarray("4D = " + std::to_string(i), temp1.Slice3D(0, 1, 2), + temp2.Slice3D(0, 1, 2)); + } +} + +template +void PrintSubarray4D(std::string name, SubArrayType subArray1) { + if (SubArrayType::NumDims != 4) { + std::cout << log::log_err << "PrintSubarray4D expects 4D subarray type.\n"; + exit(-1); + } + std::cout << name << "\n"; + using T = typename SubArrayType::DataType; + SIZE idx[4] = {0, 0, 0, 0}; + for (SIZE i = 0; i < subArray1.getShape(3); i++) { + idx[3] = i; + SubArrayType temp1 = subArray1; + temp1.offset(3, i); + PrintSubarray("i = " + std::to_string(i), temp1.Slice3D(0, 1, 2)); + } +} + // print 3D CPU template void verify_matrix(SIZE nrow, SIZE ncol, SIZE nfib, T *v, SIZE ldv1, SIZE ldv2, diff --git a/include/mgard-x/Testing/ReorderToolsCPU.h b/include/mgard-x/Testing/ReorderToolsCPU.h deleted file mode 100644 index a047b3166d..0000000000 --- a/include/mgard-x/Testing/ReorderToolsCPU.h +++ /dev/null @@ -1,24 +0,0 @@ -#ifndef REORDERTOOLSCPU_H -#define REORDERTOOLSCPU_H - -/* - * Copyright 2022, Oak Ridge National Laboratory. 
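With the serial adapter now included unconditionally (the RuntimeX.h change above), dispatch code always has a fallback backend to compile against. A hypothetical shape of such a dispatch — DeviceAdapter's template parameters are elided in this diff rendering, so the <TaskType, Backend> form here is an assumption:

template <typename TaskType>
ExecutionReturn Run(device_type dev, TaskType &task) {
  switch (dev) {
#ifdef MGARDX_COMPILE_CUDA
  case device_type::CUDA:
    return DeviceAdapter<TaskType, CUDA>().Execute(task);
#endif
#ifdef MGARDX_COMPILE_SYCL
  case device_type::SYCL:
    return DeviceAdapter<TaskType, SYCL>().Execute(task);
#endif
  default: // SERIAL is always compiled in
    return DeviceAdapter<TaskType, SERIAL>().Execute(task);
  }
}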
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "TensorMeshHierarchy.hpp" -#include "shuffle.hpp" - -namespace mgard { - -template -void ReorderCPU(TensorMeshHierarchy &hierarchy, T *input, T *output); -template -void ReverseReorderCPU(TensorMeshHierarchy &hierarchy, T *input, - T *output); - -} // namespace mgard - -#endif \ No newline at end of file diff --git a/include/mgard-x/Testing/ReorderToolsCPU.hpp b/include/mgard-x/Testing/ReorderToolsCPU.hpp deleted file mode 100644 index 11777d0470..0000000000 --- a/include/mgard-x/Testing/ReorderToolsCPU.hpp +++ /dev/null @@ -1,42 +0,0 @@ -#ifndef REORDERTOOLSGPU_HPP -#define REORDERTOOLSGPU_HPP - -#include "TensorMeshHierarchy.hpp" -#include "shuffle.hpp" - -namespace mgard { - -template -void ReorderCPU(TensorMeshHierarchy &hierarchy, T *input, T *output) { - shuffle(hierarchy, input, output); -} - -template -void ReverseReorderCPU(TensorMeshHierarchy &hierarchy, T *input, - T *output) { - unshuffle(hierarchy, input, output); -} -} // namespace mgard - -namespace mgard { -#define KERNELS(D, T) \ - template void ReorderCPU(TensorMeshHierarchy & hierarchy, \ - T * input, T * output); \ - template void ReverseReorderCPU(TensorMeshHierarchy & hierarchy, \ - T * input, T * output); - -KERNELS(1, double) -KERNELS(1, float) -KERNELS(2, double) -KERNELS(2, float) -KERNELS(3, double) -KERNELS(3, float) -KERNELS(4, double) -KERNELS(4, float) -KERNELS(5, double) -KERNELS(5, float) - -#undef KERNELS -} // namespace mgard - -#endif \ No newline at end of file diff --git a/include/mgard-x/Testing/ReorderToolsGPU.h b/include/mgard-x/Testing/ReorderToolsGPU.h deleted file mode 100644 index 6b4220c411..0000000000 --- a/include/mgard-x/Testing/ReorderToolsGPU.h +++ /dev/null @@ -1,18 +0,0 @@ -#ifndef REORDERTOOLSGPU_H -#define REORDERTOOLSGPU_H - -#include "../Common.h" - -namespace mgard_x { - -template -void ReorderGPU(Handle &handle, SubArray dinput, - SubArray &doutput, int l_target, int queue_idx); -template -void ReverseReorderGPU(Handle &handle, SubArray dinput, - SubArray &doutput, int l_target, - int queue_idx); - -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/Testing/ReorderToolsGPU.hpp b/include/mgard-x/Testing/ReorderToolsGPU.hpp deleted file mode 100644 index a0d9fec569..0000000000 --- a/include/mgard-x/Testing/ReorderToolsGPU.hpp +++ /dev/null @@ -1,337 +0,0 @@ -#ifndef REORDERTOOLSGPU_HPP -#define REORDERTOOLSGPU_HPP - -#include "../DataRefactoring.h" -#include "../GridProcessingKernel.h" -#include "../GridProcessingKernel3D.h" -#include "../LevelwiseProcessingKernel.h" -#include "ReorderToolsGPU.h" - -#include "../CommonInternal.h" - -namespace mgard_x { - -template -void ReorderGPU(Handle &handle, SubArray dinput, - SubArray &doutput, int l_target, int queue_idx) { - - SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, - dcoeff_rf, dcoeff_rc, dcoeff_rcf; - - DIM curr_dims[3]; - // handle.l_target = 1; - for (int l = 0; l < l_target; ++l) { - int range_l = std::min(6, (int)std::log2(handle.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(handle.dofs[0][l + 1]) - 1); - int unprocessed_idx = 0; - printf("reorder 1-3D\n"); - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp space - calc_coeff_pointers(handle, 
curr_dims, l, doutput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - printf("done calc ptrs\n"); - if (D <= 3) { - gpk_reo( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], dinput.ldvs_d, doutput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - dinput.dv, dinput.lddv1, dinput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - printf("done reo\n"); - } else { - gpk_reo( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], dinput.ldvs_d, doutput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - dinput.dv, dinput.lddv1, dinput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - - for (DIM d = 3; d < D; d += 2) { - // copy back to input for reordering again - lwpk(handle, handle.shapes_h[l], handle.shapes_d[l], - doutput.dv, doutput.ldvs_d, dinput.dv, dinput.ldvs_d, - queue_idx); - printf("reorder-restore %u-%uD\n", d + 1, d + 2); - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp space - calc_coeff_pointers(handle, curr_dims, l, doutput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - if (D - d == 1) { - unprocessed_idx += 1; - gpk_reo( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], dinput.ldvs_d, doutput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - dinput.dv, dinput.lddv1, dinput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, 
lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - } else { - unprocessed_idx += 2; - gpk_reo( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], dinput.ldvs_d, doutput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - dinput.dv, dinput.lddv1, dinput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - } - } - } - } -} - -template -void ReverseReorderGPU(Handle &handle, SubArray dinput, - SubArray &doutput, int l_target, - int queue_idx) { - - SubArray dcoarse, dcoeff_f, dcoeff_c, dcoeff_r, dcoeff_cf, - dcoeff_rf, dcoeff_rc, dcoeff_rcf; - - DIM curr_dims[3]; - for (int l = 0; l < l_target; ++l) { - int range_l = std::min(6, (int)std::log2(handle.dofs[0][l]) - 1); - int range_lp1 = std::min(6, (int)std::log2(handle.dofs[0][l + 1]) - 1); - int unprocessed_idx = 0; - printf("reorder-restore 1-3D\n"); - curr_dims[0] = 0; - curr_dims[1] = 1; - curr_dims[2] = 2; - dinput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp space - calc_coeff_pointers(handle, curr_dims, l, dinput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - if (D <= 3) { - gpk_rev( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], doutput.ldvs_d, dinput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - doutput.dv, doutput.lddv1, doutput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - 0, 0, 0, handle.dofs[curr_dims[2]][l], handle.dofs[curr_dims[1]][l], - handle.dofs[curr_dims[0]][l], queue_idx, - 
handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - } else { - gpk_rev( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], doutput.ldvs_d, dinput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - doutput.dv, doutput.lddv1, doutput.lddv2, dcoarse.dv, dcoarse.lddv1, - dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - 0, 0, 0, handle.dofs[curr_dims[2]][l], handle.dofs[curr_dims[1]][l], - handle.dofs[curr_dims[0]][l], queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - - for (DIM d = 3; d < D; d += 2) { - // copy back to input for reordering again - lwpk(handle, handle.shapes_h[l], handle.shapes_d[l], - doutput.dv, doutput.ldvs_d, dinput.dv, dinput.ldvs_d, - queue_idx); - - curr_dims[0] = 0; - curr_dims[1] = d; - curr_dims[2] = d + 1; - dinput.project(curr_dims[0], curr_dims[1], curr_dims[2]); - doutput.project(curr_dims[0], curr_dims[1], - curr_dims[2]); // reuse input1 as temp space - calc_coeff_pointers(handle, curr_dims, l, dinput, dcoarse, dcoeff_f, - dcoeff_c, dcoeff_r, dcoeff_cf, dcoeff_rf, dcoeff_rc, - dcoeff_rcf); - - if (D - d == 1) { - printf("reorder-restore %u-%uD\n", d + 1, d + 1); - unprocessed_idx += 1; - gpk_rev( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], doutput.ldvs_d, dinput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - doutput.dv, doutput.lddv1, doutput.lddv2, dcoarse.dv, - dcoarse.lddv1, dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - 0, 0, 0, handle.dofs[curr_dims[2]][l], - handle.dofs[curr_dims[1]][l], handle.dofs[curr_dims[0]][l], - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - } else { - printf("reorder-restore %u-%uD\n", d + 1, d + 2); - unprocessed_idx += 2; - gpk_rev( - handle, handle.shapes_h[l], handle.shapes_d[l], - handle.shapes_d[l + 1], doutput.ldvs_d, dinput.ldvs_d, - handle.unprocessed_n[unprocessed_idx], - handle.unprocessed_dims_d[unprocessed_idx], curr_dims[2], - curr_dims[1], curr_dims[0], handle.ratio[curr_dims[2]][l], - handle.ratio[curr_dims[1]][l], handle.ratio[curr_dims[0]][l], - doutput.dv, doutput.lddv1, doutput.lddv2, dcoarse.dv, - 
dcoarse.lddv1, dcoarse.lddv2, - // null, lddv1, lddv2, - dcoeff_f.dv, dcoeff_f.lddv1, dcoeff_f.lddv2, - // null, lddv1, lddv2, - dcoeff_c.dv, dcoeff_c.lddv1, dcoeff_c.lddv2, - // null, lddv1, lddv2, - dcoeff_r.dv, dcoeff_r.lddv1, dcoeff_r.lddv2, - // null, lddv1, lddv2, - dcoeff_cf.dv, dcoeff_cf.lddv1, dcoeff_cf.lddv2, - // null, lddv1, lddv2, - dcoeff_rf.dv, dcoeff_rf.lddv1, dcoeff_rf.lddv2, - // null, lddv1, lddv2, - dcoeff_rc.dv, dcoeff_rc.lddv1, dcoeff_rc.lddv2, - // null, lddv1, lddv2, - dcoeff_rcf.dv, dcoeff_rcf.lddv1, dcoeff_rcf.lddv2, - // null, lddv1, lddv2, - 0, 0, 0, handle.dofs[curr_dims[2]][l], - handle.dofs[curr_dims[1]][l], handle.dofs[curr_dims[0]][l], - queue_idx, - handle.auto_tuning_cc[handle.arch][handle.precision][range_l]); - } - } - } - } -} -} // namespace mgard_x - -#endif \ No newline at end of file diff --git a/include/mgard-x/Utilities/CMakeLists.txt b/include/mgard-x/Utilities/CMakeLists.txt index 547372c92f..7105fe953f 100644 --- a/include/mgard-x/Utilities/CMakeLists.txt +++ b/include/mgard-x/Utilities/CMakeLists.txt @@ -1,5 +1,5 @@ list(APPEND MGARD_X_HEADER - ${CMAKE_CURRENT_SOURCE_DIR}/CheckEndianess.h ${CMAKE_CURRENT_SOURCE_DIR}/ErrorCollector.h + ${CMAKE_CURRENT_SOURCE_DIR}/Types.h ) set(MGARD_X_HEADER ${MGARD_X_HEADER} PARENT_SCOPE) \ No newline at end of file diff --git a/include/mgard-x/Utilities/CheckEndianess.h b/include/mgard-x/Utilities/CheckEndianess.h deleted file mode 100644 index b610ee71de..0000000000 --- a/include/mgard-x/Utilities/CheckEndianess.h +++ /dev/null @@ -1,16 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#ifndef MGARD_X_CHECK_ENDIANESS_H -#define MGARD_X_CHECK_ENDIANESS_H - -#include "../Types.h" - -namespace mgard_x { -enum endiness_type CheckEndianess(); -} -#endif \ No newline at end of file diff --git a/include/mgard-x/Utilities/ErrorCalculator.h b/include/mgard-x/Utilities/ErrorCalculator.h index b6f81748b5..9e0d3ef10a 100644 --- a/include/mgard-x/Utilities/ErrorCalculator.h +++ b/include/mgard-x/Utilities/ErrorCalculator.h @@ -11,7 +11,7 @@ // #include "../../TensorMeshHierarchy.hpp" // #include "../../TensorNorms.hpp" // #include "../../shuffle.hpp" -#include "../Types.h" +#include "Types.h" namespace mgard_x { diff --git a/include/mgard-x/Types.h b/include/mgard-x/Utilities/Types.h similarity index 91% rename from include/mgard-x/Types.h rename to include/mgard-x/Utilities/Types.h index cc06544472..d7990c71e9 100644 --- a/include/mgard-x/Types.h +++ b/include/mgard-x/Utilities/Types.h @@ -19,13 +19,13 @@ enum class decomposition_type : uint8_t { MultiDim, SingleDim }; enum class processor_type : uint8_t { CPU, GPU_CUDA, - X_Serial, + X_SERIAL, X_CUDA, X_HIP, X_SYCL }; -enum class device_type : uint8_t { Auto, Serial, CUDA, HIP, None }; +enum class device_type : uint8_t { AUTO, SERIAL, CUDA, HIP, SYCL, NONE }; enum class error_bound_type : uint8_t { REL, ABS }; enum class norm_type : uint8_t { L_Inf, L_2 }; @@ -55,7 +55,7 @@ enum class domain_decomposition_type : uint8_t { MaxDim, Linearize }; #include // #include "RuntimeX/DataStructures/Array.h" -#include "Hierarchy.h" +#include "../Hierarchy/Hierarchy.h" // #include "RuntimeX/Messages/Message.h" // #include "ErrorCalculator.h" // #include "MemoryManagement.h" diff --git a/src/mgard-x/CMakeLists.txt b/src/mgard-x/CMakeLists.txt index d712572eeb..b7bca50012 100644 --- a/src/mgard-x/CMakeLists.txt +++ 
b/src/mgard-x/CMakeLists.txt @@ -1,9 +1,9 @@ add_subdirectory (DataRefactoring) -add_subdirectory (HighLevelAPI) +add_subdirectory (CompressionLowLevel) +add_subdirectory (CompressionHighLevel) add_subdirectory (RuntimeX) -add_subdirectory (CompressionWorkflow) -add_subdirectory (Utilities) set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) set(MGARD_X_SRC ${MGARD_X_SRC} PARENT_SCOPE) diff --git a/src/mgard-x/CompressionHighLevel/CMakeLists.txt b/src/mgard-x/CompressionHighLevel/CMakeLists.txt new file mode 100644 index 0000000000..ef7188197a --- /dev/null +++ b/src/mgard-x/CompressionHighLevel/CMakeLists.txt @@ -0,0 +1,11 @@ +MgardXGenerateSourceAllDevices("Compress") +MgardXGenerateSourceAllDevices("Decompress") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) + +list(APPEND MGARD_X_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/DynamicAPI.cpp) + +set(MGARD_X_SRC ${MGARD_X_SRC} PARENT_SCOPE) diff --git a/src/mgard-x/CompressionHighLevel/Compress.cpp.in b/src/mgard-x/CompressionHighLevel/Compress.cpp.in new file mode 100644 index 0000000000..d1d67079bd --- /dev/null +++ b/src/mgard-x/CompressionHighLevel/Compress.cpp.in @@ -0,0 +1,38 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/CompressionHighLevel/CompressionHighLevel.hpp" +// clang-format off +namespace mgard_x { + +template void compress<@DEVICE_TYPE@>(DIM D, data_type dtype, + std::vector<SIZE> shape, double tol, + double s, enum error_bound_type mode, + const void *original_data, + void *&compressed_data, + size_t &compressed_size, Config config, + bool output_pre_allocated); + +template void +compress<@DEVICE_TYPE@>(DIM D, data_type dtype, std::vector<SIZE> shape, + double tol, double s, enum error_bound_type mode, + const void *original_data, void *&compressed_data, + size_t &compressed_size, bool output_pre_allocated); + +template void compress<@DEVICE_TYPE@>( + DIM D, data_type dtype, std::vector<SIZE> shape, double tol, double s, + enum error_bound_type mode, const void *original_data, + void *&compressed_data, size_t &compressed_size, + std::vector<const Byte *> coords, Config config, bool output_pre_allocated); + +template void compress<@DEVICE_TYPE@>( + DIM D, data_type dtype, std::vector<SIZE> shape, double tol, double s, + enum error_bound_type mode, const void *original_data, + void *&compressed_data, size_t &compressed_size, + std::vector<const Byte *> coords, bool output_pre_allocated); +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/CompressionHighLevel/Decompress.cpp.in b/src/mgard-x/CompressionHighLevel/Decompress.cpp.in new file mode 100644 index 0000000000..91e0fafee4 --- /dev/null +++ b/src/mgard-x/CompressionHighLevel/Decompress.cpp.in @@ -0,0 +1,35 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory.
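(The high-level Compress.cpp.in above, and the Decompress.cpp.in that follows, are device-generic templates; a sketch of the translation unit they would presumably yield for one backend, assuming MgardXGenerateSourceAllDevices runs configure_file once per enabled device with @DEVICE_TYPE@ bound to that backend's tag type. The generated file name below is hypothetical.)

    // Hypothetical generated unit, e.g. Compress_SYCL.cpp: @DEVICE_TYPE@ -> SYCL.
    // One explicit instantiation per backend keeps the template-heavy
    // implementation out of DynamicAPI.cpp, which only dispatches at runtime.
    #include "mgard-x/CompressionHighLevel/CompressionHighLevel.hpp"
    namespace mgard_x {
    template void compress<SYCL>(DIM D, data_type dtype, std::vector<SIZE> shape,
                                 double tol, double s, enum error_bound_type mode,
                                 const void *original_data, void *&compressed_data,
                                 size_t &compressed_size, Config config,
                                 bool output_pre_allocated);
    } // namespace mgard_x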
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/CompressionHighLevel/CompressionHighLevel.hpp" +// clang-format off +namespace mgard_x { + +template void decompress<@DEVICE_TYPE@>(const void *compressed_data, + size_t compressed_size, + void *&decompressed_data, + Config config, + bool output_pre_allocated); + +template void decompress<@DEVICE_TYPE@>(const void *compressed_data, + size_t compressed_size, + void *&decompressed_data, + bool output_pre_allocated); + +template void +decompress<@DEVICE_TYPE@>(const void *compressed_data, size_t compressed_size, + void *&decompressed_data, data_type &dtype, + std::vector &shape, Config config, + bool output_pre_allocated); + +template void +decompress<@DEVICE_TYPE@>(const void *compressed_data, size_t compressed_size, + void *&decompressed_data, data_type &dtype, + std::vector &shape, bool output_pre_allocated); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/DynamicAPI.cpp b/src/mgard-x/CompressionHighLevel/DynamicAPI.cpp similarity index 67% rename from src/mgard-x/HighLevelAPI/DynamicAPI.cpp rename to src/mgard-x/CompressionHighLevel/DynamicAPI.cpp index 1127b1a6cc..c3b576826a 100644 --- a/src/mgard-x/HighLevelAPI/DynamicAPI.cpp +++ b/src/mgard-x/CompressionHighLevel/DynamicAPI.cpp @@ -11,19 +11,18 @@ #include #include -#include "MGARDXConfig.h" #include "compress_x.hpp" -#include "mgard-x/Hierarchy.h" -#include "mgard-x/HighLevelAPI.h" -#include "mgard-x/Metadata.hpp" +#include "mgard-x/CompressionHighLevel/CompressionHighLevel.h" +#include "mgard-x/CompressionHighLevel/Metadata.hpp" +#include "mgard-x/Hierarchy/Hierarchy.h" #include "mgard-x/RuntimeX/RuntimeXPublic.h" namespace mgard_x { enum device_type auto_detect_device() { - enum device_type dev_type = device_type::None; + enum device_type dev_type = device_type::NONE; #if MGARD_ENABLE_SERIAL - dev_type = device_type::Serial; + dev_type = device_type::SERIAL; #endif #if MGARD_ENABLE_CUDA if (deviceAvailable()) { @@ -35,7 +34,12 @@ enum device_type auto_detect_device() { dev_type = device_type::HIP; } #endif - if (dev_type == device_type::None) { +#if MGARD_ENABLE_SYCL + if (deviceAvailable()) { + dev_type = device_type::SYCL; + } +#endif + if (dev_type == device_type::NONE) { std::cout << log::log_err << "MGARD-X was not built with any backend.\n"; exit(-1); } @@ -48,17 +52,17 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, bool output_pre_allocated) { enum device_type dev_type = config.dev_type; - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - compress(D, dtype, shape, tol, s, mode, original_data, + compress(D, dtype, shape, tol, s, mode, original_data, compressed_data, compressed_size, config, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -77,6 +81,15 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + compress(D, 
dtype, shape, tol, s, mode, original_data, + compressed_data, compressed_size, config, + output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -90,12 +103,12 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, enum device_type dev_type = auto_detect_device(); - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - compress(D, dtype, shape, tol, s, mode, original_data, + compress(D, dtype, shape, tol, s, mode, original_data, compressed_data, compressed_size, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -113,6 +126,14 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + compress(D, dtype, shape, tol, s, mode, original_data, + compressed_data, compressed_size, output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -126,17 +147,17 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, bool output_pre_allocated) { enum device_type dev_type = config.dev_type; - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - compress(D, dtype, shape, tol, s, mode, original_data, + compress(D, dtype, shape, tol, s, mode, original_data, compressed_data, compressed_size, coords, config, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -155,6 +176,15 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + compress(D, dtype, shape, tol, s, mode, original_data, + compressed_data, compressed_size, coords, config, + output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -168,13 +198,13 @@ void compress(DIM D, data_type dtype, std::vector shape, double tol, enum device_type dev_type = auto_detect_device(); - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - compress(D, dtype, shape, tol, s, mode, original_data, + compress(D, dtype, shape, tol, s, mode, original_data, compressed_data, compressed_size, coords, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -193,6 +223,15 @@ void compress(DIM D, data_type dtype, std::vector shape, double 
tol, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + compress(D, dtype, shape, tol, s, mode, original_data, + compressed_data, compressed_size, coords, + output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -204,16 +243,16 @@ void decompress(const void *compressed_data, size_t compressed_size, bool output_pre_allocated) { enum device_type dev_type = config.dev_type; - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - decompress(compressed_data, compressed_size, decompressed_data, + decompress(compressed_data, compressed_size, decompressed_data, config, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -231,6 +270,14 @@ void decompress(const void *compressed_data, size_t compressed_size, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + decompress(compressed_data, compressed_size, decompressed_data, + config, output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -242,12 +289,12 @@ void decompress(const void *compressed_data, size_t compressed_size, enum device_type dev_type = auto_detect_device(); - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - decompress(compressed_data, compressed_size, decompressed_data, + decompress(compressed_data, compressed_size, decompressed_data, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -265,6 +312,14 @@ void decompress(const void *compressed_data, size_t compressed_size, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + decompress(compressed_data, compressed_size, decompressed_data, + output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -276,16 +331,16 @@ void decompress(const void *compressed_data, size_t compressed_size, data_type &dtype, Config config, bool output_pre_allocated) { enum device_type dev_type = config.dev_type; - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - decompress(compressed_data, compressed_size, decompressed_data, + decompress(compressed_data, compressed_size, decompressed_data, dtype, shape, config, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not 
built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -303,6 +358,14 @@ void decompress(const void *compressed_data, size_t compressed_size, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + decompress(compressed_data, compressed_size, decompressed_data, dtype, + shape, config, output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -315,12 +378,12 @@ void decompress(const void *compressed_data, size_t compressed_size, enum device_type dev_type = auto_detect_device(); - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - decompress(compressed_data, compressed_size, decompressed_data, + decompress(compressed_data, compressed_size, decompressed_data, dtype, shape, output_pre_allocated); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -338,6 +401,14 @@ void decompress(const void *compressed_data, size_t compressed_size, #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + decompress(compressed_data, compressed_size, decompressed_data, dtype, + shape, output_pre_allocated); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -346,15 +417,15 @@ void decompress(const void *compressed_data, size_t compressed_size, void BeginAutoTuning(enum device_type dev_type) { - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - mgard_x::BeginAutoTuning(); + mgard_x::BeginAutoTuning(); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type == device_type::CUDA) { @@ -370,6 +441,13 @@ void BeginAutoTuning(enum device_type dev_type) { #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + mgard_x::BeginAutoTuning(); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; @@ -378,15 +456,15 @@ void BeginAutoTuning(enum device_type dev_type) { void EndAutoTuning(enum device_type dev_type) { - if (dev_type == device_type::Auto) { + if (dev_type == device_type::AUTO) { dev_type = auto_detect_device(); } - if (dev_type == device_type::Serial) { + if (dev_type == device_type::SERIAL) { #if MGARD_ENABLE_SERIAL - mgard_x::EndAutoTuning(); + mgard_x::EndAutoTuning(); #else - std::cout << log::log_err << "MGARD-X was not built with Serial backend.\n"; + std::cout << log::log_err << "MGARD-X was not built with SERIAL backend.\n"; exit(-1); #endif } else if (dev_type 
== device_type::CUDA) { @@ -402,6 +480,13 @@ void EndAutoTuning(enum device_type dev_type) { #else std::cout << log::log_err << "MGARD-X was not built with HIP backend.\n"; exit(-1); +#endif + } else if (dev_type == device_type::SYCL) { +#if MGARD_ENABLE_SYCL + mgard_x::EndAutoTuning<SYCL>(); +#else + std::cout << log::log_err << "MGARD-X was not built with SYCL backend.\n"; + exit(-1); #endif } else { std::cout << log::log_err << "Unsupported backend.\n"; diff --git a/src/mgard-x/CompressionLowLevel/CMakeLists.txt b/src/mgard-x/CompressionLowLevel/CMakeLists.txt new file mode 100644 index 0000000000..c10580c777 --- /dev/null +++ b/src/mgard-x/CompressionLowLevel/CMakeLists.txt @@ -0,0 +1,6 @@ +MgardXGenerateSourceAllCombinations("Compress") +MgardXGenerateSourceAllCombinations("Decompress") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/CompressionLowLevel/Compress.cpp.in b/src/mgard-x/CompressionLowLevel/Compress.cpp.in new file mode 100644 index 0000000000..f042976823 --- /dev/null +++ b/src/mgard-x/CompressionLowLevel/Compress.cpp.in @@ -0,0 +1,20 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/CompressionLowLevel/CompressionLowLevel.hpp" +// clang-format off +namespace mgard_x { + +template Array<1, unsigned char, @DEVICE_TYPE@> +compress<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + Array<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &in_array, + enum error_bound_type type, @DATA_TYPE@ tol, @DATA_TYPE@ s, + @DATA_TYPE@ &norm, Config config); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/CompressionLowLevel/Decompress.cpp.in b/src/mgard-x/CompressionLowLevel/Decompress.cpp.in new file mode 100644 index 0000000000..49a7bb4f0d --- /dev/null +++ b/src/mgard-x/CompressionLowLevel/Decompress.cpp.in @@ -0,0 +1,20 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory.
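(For orientation, a minimal caller sketch against the low-level interface instantiated above. It follows the compress/decompress signatures in these .cpp.in files; the Hierarchy/Array construction and the load() call are assumed from MGARD-X's public examples, and the shape, tolerance, and smoothness values are illustrative only.)

    #include "mgard-x/CompressionLowLevel/CompressionLowLevel.hpp"
    #include <vector>
    void roundtrip(float *host_data) {
      std::vector<mgard_x::SIZE> shape{128, 128, 128};
      mgard_x::Hierarchy<3, float, mgard_x::SYCL> hierarchy(shape);
      mgard_x::Array<3, float, mgard_x::SYCL> in_array(shape);
      in_array.load(host_data); // copy input into the device-resident Array
      mgard_x::Config config;
      float norm = 0;
      // compress returns a 1-D byte Array and records the norm for later use.
      mgard_x::Array<1, unsigned char, mgard_x::SYCL> compressed =
          mgard_x::compress(hierarchy, in_array, mgard_x::error_bound_type::REL,
                            1e-3f, 0.0f, norm, config);
      // decompress takes the same hierarchy plus the recorded norm (by value).
      mgard_x::Array<3, float, mgard_x::SYCL> decompressed =
          mgard_x::decompress(hierarchy, compressed,
                              mgard_x::error_bound_type::REL, 1e-3f, 0.0f,
                              norm, config);
    }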
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/CompressionLowLevel/CompressionLowLevel.hpp" +// clang-format off +namespace mgard_x { + +template Array<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> +decompress<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + Array<1, unsigned char, @DEVICE_TYPE@> &compressed_array, + enum error_bound_type type, @DATA_TYPE@ tol, @DATA_TYPE@ s, + @DATA_TYPE@ norm, Config config); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/CMakeLists.txt b/src/mgard-x/CompressionWorkflow/CMakeLists.txt deleted file mode 100644 index 82df5a06d0..0000000000 --- a/src/mgard-x/CompressionWorkflow/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -if(MGARD_ENABLE_SERIAL) - add_subdirectory (Serial) - set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_CUDA) - add_subdirectory (CUDA) - set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_HIP) - add_subdirectory (HIP) - set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) -endif() \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/CUDA/CMakeLists.txt b/src/mgard-x/CompressionWorkflow/CUDA/CMakeLists.txt deleted file mode 100644 index 3ea070d9a5..0000000000 --- a/src/mgard-x/CompressionWorkflow/CUDA/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_CUDA_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/CompressionWorkflow.cu) - -set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/CUDA/CompressionWorkflow.cu b/src/mgard-x/CompressionWorkflow/CUDA/CompressionWorkflow.cu deleted file mode 100644 index 16163a1813..0000000000 --- a/src/mgard-x/CompressionWorkflow/CUDA/CompressionWorkflow.cu +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
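(The per-backend CompressionWorkflow sources deleted in the next few hunks carried these same instantiations by hand, stamped out by the KERNELS(D, T) macro visible below for D = 1..5 and T = double/float. Spelled out for one combination, with the template arguments written explicitly on the assumption that they match the generated sources above:)

    // Roughly what KERNELS(3, float) expanded to in the deleted CUDA unit.
    template Array<1, unsigned char, CUDA> compress<3, float, CUDA>(
        Hierarchy<3, float, CUDA> &hierarchy, Array<3, float, CUDA> &in_array,
        enum error_bound_type type, float tol, float s, float &norm,
        Config config);
    template Array<3, float, CUDA> decompress<3, float, CUDA>(
        Hierarchy<3, float, CUDA> &hierarchy,
        Array<1, unsigned char, CUDA> &compressed_array,
        enum error_bound_type type, float tol, float s, float norm,
        Config config);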
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/CompressionWorkflow.hpp" - -#include - -#include -namespace mgard_x { - -#define KERNELS(D, T) \ - template Array<1, unsigned char, CUDA> compress( \ - Hierarchy & hierarchy, Array & in_array, \ - enum error_bound_type type, T tol, T s, T & norm, Config config); \ - template Array decompress( \ - Hierarchy & hierarchy, \ - Array<1, unsigned char, CUDA> & compressed_array, \ - enum error_bound_type type, T tol, T s, T norm, Config config); - -KERNELS(1, double) -KERNELS(1, float) -KERNELS(2, double) -KERNELS(2, float) -KERNELS(3, double) -KERNELS(3, float) -KERNELS(4, double) -KERNELS(4, float) -KERNELS(5, double) -KERNELS(5, float) -#undef KERNELS - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/HIP/CMakeLists.txt b/src/mgard-x/CompressionWorkflow/HIP/CMakeLists.txt deleted file mode 100644 index 4360eb9402..0000000000 --- a/src/mgard-x/CompressionWorkflow/HIP/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_HIP_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/CompressionWorkflow.cpp) - -set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/HIP/CompressionWorkflow.cpp b/src/mgard-x/CompressionWorkflow/HIP/CompressionWorkflow.cpp deleted file mode 100644 index f2df577737..0000000000 --- a/src/mgard-x/CompressionWorkflow/HIP/CompressionWorkflow.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/CompressionWorkflow.hpp" - -#include - -#include -namespace mgard_x { - -#define KERNELS(D, T) \ - template Array<1, unsigned char, HIP> compress( \ - Hierarchy & hierarchy, Array & in_array, \ - enum error_bound_type type, T tol, T s, T & norm, Config config); \ - template Array decompress( \ - Hierarchy & hierarchy, \ - Array<1, unsigned char, HIP> & compressed_array, \ - enum error_bound_type type, T tol, T s, T norm, Config config); - -KERNELS(1, double) -KERNELS(1, float) -KERNELS(2, double) -KERNELS(2, float) -KERNELS(3, double) -KERNELS(3, float) -KERNELS(4, double) -KERNELS(4, float) -KERNELS(5, double) -KERNELS(5, float) -#undef KERNELS - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/Serial/CMakeLists.txt b/src/mgard-x/CompressionWorkflow/Serial/CMakeLists.txt deleted file mode 100644 index a8196d79a3..0000000000 --- a/src/mgard-x/CompressionWorkflow/Serial/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_SERIAL_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/CompressionWorkflow.cpp) - -set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/CompressionWorkflow/Serial/CompressionWorkflow.cpp b/src/mgard-x/CompressionWorkflow/Serial/CompressionWorkflow.cpp deleted file mode 100644 index a745792615..0000000000 --- a/src/mgard-x/CompressionWorkflow/Serial/CompressionWorkflow.cpp +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/CompressionWorkflow.hpp" - -#include - -#include -namespace mgard_x { - -#define KERNELS(D, T) \ - template Array<1, unsigned char, Serial> compress( \ - Hierarchy & hierarchy, Array & in_array, \ - enum error_bound_type type, T tol, T s, T & norm, Config config); \ - template Array decompress( \ - Hierarchy & hierarchy, \ - Array<1, unsigned char, Serial> & compressed_array, \ - enum error_bound_type type, T tol, T s, T norm, Config config); - -KERNELS(1, double) -KERNELS(1, float) -KERNELS(2, double) -KERNELS(2, float) -KERNELS(3, double) -KERNELS(3, float) -KERNELS(4, double) -KERNELS(4, float) -KERNELS(5, double) -KERNELS(5, float) -#undef KERNELS - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CMakeLists.txt b/src/mgard-x/DataRefactoring/CMakeLists.txt index 82df5a06d0..21b0652e7f 100644 --- a/src/mgard-x/DataRefactoring/CMakeLists.txt +++ b/src/mgard-x/DataRefactoring/CMakeLists.txt @@ -1,12 +1,6 @@ -if(MGARD_ENABLE_SERIAL) - add_subdirectory (Serial) - set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_CUDA) - add_subdirectory (CUDA) - set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_HIP) - add_subdirectory (HIP) - set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) -endif() \ No newline at end of file +add_subdirectory (MultiDimension) +add_subdirectory (SingleDimension) +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/CMakeLists.txt b/src/mgard-x/DataRefactoring/CUDA/CMakeLists.txt deleted file mode 100644 index 62c32de19d..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -list(APPEND MGARD_X_CUDA_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Double.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Float.cu - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Double.cu - ) - -set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Double.cu deleted file mode 100644 index e7befd552d..0000000000 --- 
a/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<1, double, CUDA>(Hierarchy<1, double, CUDA> &hierarchy, - SubArray<1, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<1, double, CUDA>(Hierarchy<1, double, CUDA> &hierarchy, - SubArray<1, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Float.cu deleted file mode 100644 index b8a014ae48..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_1D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<1, float, CUDA>(Hierarchy<1, float, CUDA> &hierarchy, - SubArray<1, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<1, float, CUDA>(Hierarchy<1, float, CUDA> &hierarchy, - SubArray<1, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Double.cu deleted file mode 100644 index 27fe8c14f3..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<2, double, CUDA>(Hierarchy<2, double, CUDA> &hierarchy, - SubArray<2, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<2, double, CUDA>(Hierarchy<2, double, CUDA> &hierarchy, - SubArray<2, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Float.cu deleted file mode 100644 index dca55294d8..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_2D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
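(Every file in this block instantiates the same two function templates for one dimension/precision pair. The primary declarations they presume look like the following sketch, inferred directly from the instantiations; the authoritative declarations live in the two DataRefactoring.hpp headers included by each file.)

    namespace mgard_x {
    // Multigrid decomposition of v across l_target levels, enqueued on queue_idx.
    template <DIM D, typename T, typename DeviceType>
    void decompose(Hierarchy<D, T, DeviceType> &hierarchy,
                   SubArray<D, T, DeviceType> &v, SIZE l_target, int queue_idx);
    // Single-dimension variant used by the SingleDimension refactoring path.
    template <DIM D, typename T, typename DeviceType>
    void decompose_single(Hierarchy<D, T, DeviceType> &hierarchy,
                          SubArray<D, T, DeviceType> &v, SIZE l_target,
                          int queue_idx);
    } // namespace mgard_x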
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<2, float, CUDA>(Hierarchy<2, float, CUDA> &hierarchy, - SubArray<2, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<2, float, CUDA>(Hierarchy<2, float, CUDA> &hierarchy, - SubArray<2, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Double.cu deleted file mode 100644 index 8696ff6cd0..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<3, double, CUDA>(Hierarchy<3, double, CUDA> &hierarchy, - SubArray<3, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<3, double, CUDA>(Hierarchy<3, double, CUDA> &hierarchy, - SubArray<3, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Float.cu deleted file mode 100644 index d5b9678afe..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_3D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<3, float, CUDA>(Hierarchy<3, float, CUDA> &hierarchy, - SubArray<3, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<3, float, CUDA>(Hierarchy<3, float, CUDA> &hierarchy, - SubArray<3, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Double.cu deleted file mode 100644 index b60c26ee29..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<4, double, CUDA>(Hierarchy<4, double, CUDA> &hierarchy, - SubArray<4, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<4, double, CUDA>(Hierarchy<4, double, CUDA> &hierarchy, - SubArray<4, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Float.cu deleted file mode 100644 index b445c109bf..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_4D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<4, float, CUDA>(Hierarchy<4, float, CUDA> &hierarchy, - SubArray<4, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<4, float, CUDA>(Hierarchy<4, float, CUDA> &hierarchy, - SubArray<4, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Double.cu deleted file mode 100644 index 1c21fec0a9..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<5, double, CUDA>(Hierarchy<5, double, CUDA> &hierarchy, - SubArray<5, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<5, double, CUDA>(Hierarchy<5, double, CUDA> &hierarchy, - SubArray<5, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Float.cu deleted file mode 100644 index 75c14f6bc4..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Decomposition_5D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<5, float, CUDA>(Hierarchy<5, float, CUDA> &hierarchy, - SubArray<5, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<5, float, CUDA>(Hierarchy<5, float, CUDA> &hierarchy, - SubArray<5, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Double.cu deleted file mode 100644 index 7dbfdf79d0..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<1, double, CUDA>(Hierarchy<1, double, CUDA> &hierarchy, - SubArray<1, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<1, double, CUDA>(Hierarchy<1, double, CUDA> &hierarchy, - SubArray<1, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Float.cu deleted file mode 100644 index df0f4f84e5..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_1D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<1, float, CUDA>(Hierarchy<1, float, CUDA> &hierarchy, - SubArray<1, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<1, float, CUDA>(Hierarchy<1, float, CUDA> &hierarchy, - SubArray<1, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Double.cu deleted file mode 100644 index ac6da649f8..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<2, double, CUDA>(Hierarchy<2, double, CUDA> &hierarchy, - SubArray<2, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<2, double, CUDA>(Hierarchy<2, double, CUDA> &hierarchy, - SubArray<2, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Float.cu deleted file mode 100644 index 8c9f4297a4..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_2D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<2, float, CUDA>(Hierarchy<2, float, CUDA> &hierarchy, - SubArray<2, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<2, float, CUDA>(Hierarchy<2, float, CUDA> &hierarchy, - SubArray<2, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Double.cu deleted file mode 100644 index 4dd374167e..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<3, double, CUDA>(Hierarchy<3, double, CUDA> &hierarchy, - SubArray<3, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<3, double, CUDA>(Hierarchy<3, double, CUDA> &hierarchy, - SubArray<3, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Float.cu deleted file mode 100644 index 9834cd15f8..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_3D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<3, float, CUDA>(Hierarchy<3, float, CUDA> &hierarchy, - SubArray<3, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<3, float, CUDA>(Hierarchy<3, float, CUDA> &hierarchy, - SubArray<3, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Double.cu deleted file mode 100644 index 242604bb42..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<4, double, CUDA>(Hierarchy<4, double, CUDA> &hierarchy, - SubArray<4, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<4, double, CUDA>(Hierarchy<4, double, CUDA> &hierarchy, - SubArray<4, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Float.cu deleted file mode 100644 index 5747fe88b9..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_4D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<4, float, CUDA>(Hierarchy<4, float, CUDA> &hierarchy, - SubArray<4, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<4, float, CUDA>(Hierarchy<4, float, CUDA> &hierarchy, - SubArray<4, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Double.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Double.cu deleted file mode 100644 index af972fd18c..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Double.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<5, double, CUDA>(Hierarchy<5, double, CUDA> &hierarchy, - SubArray<5, double, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<5, double, CUDA>(Hierarchy<5, double, CUDA> &hierarchy, - SubArray<5, double, CUDA> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Float.cu b/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Float.cu deleted file mode 100644 index 39bbf881fd..0000000000 --- a/src/mgard-x/DataRefactoring/CUDA/Recomposition_5D_Float.cu +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<5, float, CUDA>(Hierarchy<5, float, CUDA> &hierarchy, - SubArray<5, float, CUDA> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<5, float, CUDA>(Hierarchy<5, float, CUDA> &hierarchy, - SubArray<5, float, CUDA> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/CMakeLists.txt b/src/mgard-x/DataRefactoring/HIP/CMakeLists.txt deleted file mode 100644 index 1a7736f707..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -list(APPEND MGARD_X_HIP_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Double.cpp - ) - -set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Double.cpp deleted file mode 100644 index 2bd1056f70..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, 
Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<1, double, HIP>(Hierarchy<1, double, HIP> &hierarchy, - SubArray<1, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<1, double, HIP>(Hierarchy<1, double, HIP> &hierarchy, - SubArray<1, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Float.cpp deleted file mode 100644 index 4c9e6abecf..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_1D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<1, float, HIP>(Hierarchy<1, float, HIP> &hierarchy, - SubArray<1, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<1, float, HIP>(Hierarchy<1, float, HIP> &hierarchy, - SubArray<1, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Double.cpp deleted file mode 100644 index 5ae8cdbefc..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<2, double, HIP>(Hierarchy<2, double, HIP> &hierarchy, - SubArray<2, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<2, double, HIP>(Hierarchy<2, double, HIP> &hierarchy, - SubArray<2, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Float.cpp deleted file mode 100644 index c7d7166065..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_2D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<2, float, HIP>(Hierarchy<2, float, HIP> &hierarchy, - SubArray<2, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<2, float, HIP>(Hierarchy<2, float, HIP> &hierarchy, - SubArray<2, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Double.cpp deleted file mode 100644 index 9f5b06257c..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<3, double, HIP>(Hierarchy<3, double, HIP> &hierarchy, - SubArray<3, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<3, double, HIP>(Hierarchy<3, double, HIP> &hierarchy, - SubArray<3, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Float.cpp deleted file mode 100644 index ab710b8056..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_3D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<3, float, HIP>(Hierarchy<3, float, HIP> &hierarchy, - SubArray<3, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<3, float, HIP>(Hierarchy<3, float, HIP> &hierarchy, - SubArray<3, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Double.cpp deleted file mode 100644 index 7d8c6d1cb1..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<4, double, HIP>(Hierarchy<4, double, HIP> &hierarchy, - SubArray<4, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<4, double, HIP>(Hierarchy<4, double, HIP> &hierarchy, - SubArray<4, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Float.cpp deleted file mode 100644 index b58becf5e4..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_4D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<4, float, HIP>(Hierarchy<4, float, HIP> &hierarchy, - SubArray<4, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<4, float, HIP>(Hierarchy<4, float, HIP> &hierarchy, - SubArray<4, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Double.cpp deleted file mode 100644 index 9f70b93bb0..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<5, double, HIP>(Hierarchy<5, double, HIP> &hierarchy, - SubArray<5, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<5, double, HIP>(Hierarchy<5, double, HIP> &hierarchy, - SubArray<5, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Float.cpp deleted file mode 100644 index 0d8afc5972..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Decomposition_5D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void decompose<5, float, HIP>(Hierarchy<5, float, HIP> &hierarchy, - SubArray<5, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - decompose_single<5, float, HIP>(Hierarchy<5, float, HIP> &hierarchy, - SubArray<5, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Double.cpp deleted file mode 100644 index 412e1291f7..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<1, double, HIP>(Hierarchy<1, double, HIP> &hierarchy, - SubArray<1, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<1, double, HIP>(Hierarchy<1, double, HIP> &hierarchy, - SubArray<1, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Float.cpp deleted file mode 100644 index 0421843007..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_1D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<1, float, HIP>(Hierarchy<1, float, HIP> &hierarchy, - SubArray<1, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<1, float, HIP>(Hierarchy<1, float, HIP> &hierarchy, - SubArray<1, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Double.cpp deleted file mode 100644 index bfc2bfb572..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<2, double, HIP>(Hierarchy<2, double, HIP> &hierarchy, - SubArray<2, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<2, double, HIP>(Hierarchy<2, double, HIP> &hierarchy, - SubArray<2, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Float.cpp deleted file mode 100644 index 984b5ada23..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_2D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<2, float, HIP>(Hierarchy<2, float, HIP> &hierarchy, - SubArray<2, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<2, float, HIP>(Hierarchy<2, float, HIP> &hierarchy, - SubArray<2, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Double.cpp deleted file mode 100644 index 65433c2022..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<3, double, HIP>(Hierarchy<3, double, HIP> &hierarchy, - SubArray<3, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<3, double, HIP>(Hierarchy<3, double, HIP> &hierarchy, - SubArray<3, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Float.cpp deleted file mode 100644 index f9de58d439..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_3D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<3, float, HIP>(Hierarchy<3, float, HIP> &hierarchy, - SubArray<3, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<3, float, HIP>(Hierarchy<3, float, HIP> &hierarchy, - SubArray<3, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Double.cpp deleted file mode 100644 index 165fe8af5b..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<4, double, HIP>(Hierarchy<4, double, HIP> &hierarchy, - SubArray<4, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<4, double, HIP>(Hierarchy<4, double, HIP> &hierarchy, - SubArray<4, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Float.cpp deleted file mode 100644 index 4d9e0cab1d..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_4D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<4, float, HIP>(Hierarchy<4, float, HIP> &hierarchy, - SubArray<4, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<4, float, HIP>(Hierarchy<4, float, HIP> &hierarchy, - SubArray<4, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Double.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Double.cpp deleted file mode 100644 index 2f9ae8423a..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Double.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<5, double, HIP>(Hierarchy<5, double, HIP> &hierarchy, - SubArray<5, double, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<5, double, HIP>(Hierarchy<5, double, HIP> &hierarchy, - SubArray<5, double, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Float.cpp b/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Float.cpp deleted file mode 100644 index 73ed87b844..0000000000 --- a/src/mgard-x/DataRefactoring/HIP/Recomposition_5D_Float.cpp +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void recompose<5, float, HIP>(Hierarchy<5, float, HIP> &hierarchy, - SubArray<5, float, HIP> &v, - SIZE l_target, int queue_idx); -template void - recompose_single<5, float, HIP>(Hierarchy<5, float, HIP> &hierarchy, - SubArray<5, float, HIP> &v, SIZE l_target, - int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/CMakeLists.txt b/src/mgard-x/DataRefactoring/MultiDimension/CMakeLists.txt new file mode 100644 index 0000000000..cb89cb471d --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/CMakeLists.txt @@ -0,0 +1,9 @@ +add_subdirectory (Coefficient) +add_subdirectory (Correction) +add_subdirectory (CopyND) +MgardXGenerateSourceAllCombinations("Decompose") +MgardXGenerateSourceAllCombinations("Recompose") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt new file mode 100644 index 0000000000..c6bdc5530b --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CMakeLists.txt @@ -0,0 +1,8 @@ +MgardXGenerateSourceAllCombinations("CalcCoefficients3D") +MgardXGenerateSourceAllCombinations("CoefficientsRestore3D") +MgardXGenerateSourceAllCombinations("CalcCoefficientsND") +MgardXGenerateSourceAllCombinations("CoefficientsRestoreND") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file
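Both CMakeLists above drop the long per-file source lists in favor of MgardXGenerateSourceAllCombinations, which stamps out one translation unit per (dimension, data type, backend) tuple from a single .cpp.in template. The module defining that helper is not part of this hunk; what follows is a minimal sketch of such a generator, assuming a configure_file()-based expansion and assuming the backend tag (SERIAL/CUDA/HIP/SYCL) doubles as the C++ device type substituted for @DEVICE_TYPE@. The real implementation may differ.

# Sketch only; not the actual MgardXGenerateSource module.
function(MgardXGenerateSourceAllCombinations name)
  foreach(DEVICE_TYPE SERIAL CUDA HIP SYCL)
    if(NOT MGARD_ENABLE_${DEVICE_TYPE})
      continue()                     # backend switched off; generate nothing
    endif()
    foreach(NUM_DIM RANGE 1 5)
      foreach(DATA_TYPE float double)
        set(out "${CMAKE_CURRENT_BINARY_DIR}/${name}_${NUM_DIM}D_${DATA_TYPE}_${DEVICE_TYPE}.cpp")
        # Substitute @NUM_DIM@/@DATA_TYPE@/@DEVICE_TYPE@ in <name>.cpp.in.
        configure_file("${CMAKE_CURRENT_SOURCE_DIR}/${name}.cpp.in" "${out}" @ONLY)
        list(APPEND MGARD_X_${DEVICE_TYPE}_SRC "${out}")
      endforeach()
    endforeach()
    set(MGARD_X_${DEVICE_TYPE}_SRC ${MGARD_X_${DEVICE_TYPE}_SRC} PARENT_SCOPE)
  endforeach()
endfunction()

Centralizing the combination matrix this way is why supporting SYCL needs only the extra MGARD_X_SYCL_SRC export above rather than another twenty hand-written instantiation files like the ones each backend's DataRefactoring directory used to carry.

diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.cpp.in new file mode 100644 index 0000000000..b45a2c4075 --- /dev/null +++ 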
b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficients3D.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCoefficients3D<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.cpp.in new file mode 100644 index 0000000000..d183581175 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.cpp.in @@ -0,0 +1,20 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Coefficient/CalcCoefficientsND.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCoefficientsND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput1, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput2, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.cpp.in new file mode 100644 index 0000000000..c7fec64b5e --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestore3D.hpp" +// clang-format off +namespace mgard_x { + +template void CoefficientsRestore3D<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.cpp.in new file mode 100644 index 0000000000..6d6da0248e --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.cpp.in @@ -0,0 +1,20 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Coefficient/CoefficientsRestoreND.hpp" +// clang-format off +namespace mgard_x { + +template void CoefficientsRestoreND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput1, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput2, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.cpp.in new file mode 100644 index 0000000000..642c34f3bb --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/CopyND/AddND.hpp" +// clang-format off +namespace mgard_x { + +template void AddND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CMakeLists.txt b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CMakeLists.txt new file mode 100644 index 0000000000..305759b565 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CMakeLists.txt @@ -0,0 +1,7 @@ +MgardXGenerateSourceAllCombinations("CopyND") +MgardXGenerateSourceAllCombinations("AddND") +MgardXGenerateSourceAllCombinations("SubtractND") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.cpp.in new file mode 100644 index 0000000000..1050fdd6ee --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/CopyND/CopyND.hpp" +// clang-format off +namespace mgard_x { + +template void CopyND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.cpp.in new file mode 100644 index 0000000000..b63b081af3 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/CopyND/SubtractND.hpp" +// clang-format off +namespace mgard_x { + +template void SubtractND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dinput, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &doutput, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Correction/CMakeLists.txt b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CMakeLists.txt new file mode 100644 index 0000000000..9c9fe8f704 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CMakeLists.txt @@ -0,0 +1,6 @@ +MgardXGenerateSourceAllCombinations("CalcCorrection3D") +MgardXGenerateSourceAllCombinations("CalcCorrectionND") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.cpp.in new file mode 100644 index 0000000000..ecf79d9b50 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrection3D.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCorrection3D<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dcoeff, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &dcorrection, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.cpp.in new file mode 100644 index 0000000000..a939ce52e2 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/Correction/CalcCorrectionND.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCorrectionND<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> dcoeff, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &dcorrection, SIZE l, + int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Decompose.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Decompose.cpp.in new file mode 100644 index 0000000000..39fedf66cd --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Decompose.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" +// clang-format off +namespace mgard_x { + +template void decompose<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &v, SIZE l_target, + int queue_idx); +} // namespace mgard_x +// clang-format on \ No newline at end of file
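Each .cpp.in above is an explicit-instantiation unit with the dimension, scalar type, and device left as configure-time placeholders. For illustration, expanding the Decompose.cpp.in template above with NUM_DIM=3, DATA_TYPE=float, DEVICE_TYPE=CUDA would produce roughly the following generated file (a sketch of the configure-time output; it is never checked in):

/*
 * Illustration: plausible configure-time expansion of Decompose.cpp.in
 * for NUM_DIM=3, DATA_TYPE=float, DEVICE_TYPE=CUDA.
 */
#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp"
// clang-format off
namespace mgard_x {

template void decompose<3, float, CUDA>(
    Hierarchy<3, float, CUDA> &hierarchy,
    SubArray<3, float, CUDA> &v, SIZE l_target,
    int queue_idx);
} // namespace mgard_x
// clang-format on

Keeping one instantiation per generated translation unit bounds the compile time and memory use of any single file, which matters for these heavily templated kernels.

diff --git a/src/mgard-x/DataRefactoring/MultiDimension/Recompose.cpp.in b/src/mgard-x/DataRefactoring/MultiDimension/Recompose.cpp.in new file mode 100644 index 0000000000..8f476aaac4 --- /dev/null +++ b/src/mgard-x/DataRefactoring/MultiDimension/Recompose.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 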
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" +// clang-format off +namespace mgard_x { + +template void recompose<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &v, SIZE l_target, + int queue_idx); +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/CMakeLists.txt b/src/mgard-x/DataRefactoring/Serial/CMakeLists.txt deleted file mode 100644 index cd1ed0e623..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/CMakeLists.txt +++ /dev/null @@ -1,24 +0,0 @@ -list(APPEND MGARD_X_SERIAL_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_1D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_1D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_2D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_2D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_3D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_3D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_4D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_4D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Decomposition_5D_Double.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Float.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Recomposition_5D_Double.cpp - ) - -set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Double.cpp deleted file mode 100644 index 6e21991762..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<1, double, Serial>(Hierarchy<1, double, Serial> &hierarchy, - SubArray<1, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<1, double, Serial>(Hierarchy<1, double, Serial> &hierarchy, - SubArray<1, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Float.cpp deleted file mode 100644 index 28a7d34959..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_1D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<1, float, Serial>(Hierarchy<1, float, Serial> &hierarchy, - SubArray<1, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<1, float, Serial>(Hierarchy<1, float, Serial> &hierarchy, - SubArray<1, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Double.cpp deleted file mode 100644 index 4885cac659..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<2, double, Serial>(Hierarchy<2, double, Serial> &hierarchy, - SubArray<2, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<2, double, Serial>(Hierarchy<2, double, Serial> &hierarchy, - SubArray<2, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Float.cpp deleted file mode 100644 index 1477d1bc23..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_2D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<2, float, Serial>(Hierarchy<2, float, Serial> &hierarchy, - SubArray<2, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<2, float, Serial>(Hierarchy<2, float, Serial> &hierarchy, - SubArray<2, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Double.cpp deleted file mode 100644 index 49de330b98..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<3, double, Serial>(Hierarchy<3, double, Serial> &hierarchy, - SubArray<3, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<3, double, Serial>(Hierarchy<3, double, Serial> &hierarchy, - SubArray<3, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Float.cpp deleted file mode 100644 index 41a35a8a1f..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_3D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<3, float, Serial>(Hierarchy<3, float, Serial> &hierarchy, - SubArray<3, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<3, float, Serial>(Hierarchy<3, float, Serial> &hierarchy, - SubArray<3, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Double.cpp deleted file mode 100644 index f59f81a53e..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<4, double, Serial>(Hierarchy<4, double, Serial> &hierarchy, - SubArray<4, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<4, double, Serial>(Hierarchy<4, double, Serial> &hierarchy, - SubArray<4, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Float.cpp deleted file mode 100644 index b0bf406e37..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_4D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<4, float, Serial>(Hierarchy<4, float, Serial> &hierarchy, - SubArray<4, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<4, float, Serial>(Hierarchy<4, float, Serial> &hierarchy, - SubArray<4, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Double.cpp deleted file mode 100644 index 439d1e9a02..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<5, double, Serial>(Hierarchy<5, double, Serial> &hierarchy, - SubArray<5, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<5, double, Serial>(Hierarchy<5, double, Serial> &hierarchy, - SubArray<5, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Float.cpp deleted file mode 100644 index d4ccdfd8e9..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Decomposition_5D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - decompose<5, float, Serial>(Hierarchy<5, float, Serial> &hierarchy, - SubArray<5, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - decompose_single<5, float, Serial>(Hierarchy<5, float, Serial> &hierarchy, - SubArray<5, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Double.cpp deleted file mode 100644 index 4a71227ea1..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<1, double, Serial>(Hierarchy<1, double, Serial> &hierarchy, - SubArray<1, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<1, double, Serial>(Hierarchy<1, double, Serial> &hierarchy, - SubArray<1, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Float.cpp deleted file mode 100644 index 4ccd960f83..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_1D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<1, float, Serial>(Hierarchy<1, float, Serial> &hierarchy, - SubArray<1, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<1, float, Serial>(Hierarchy<1, float, Serial> &hierarchy, - SubArray<1, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Double.cpp deleted file mode 100644 index 16365c6a1d..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<2, double, Serial>(Hierarchy<2, double, Serial> &hierarchy, - SubArray<2, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<2, double, Serial>(Hierarchy<2, double, Serial> &hierarchy, - SubArray<2, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Float.cpp deleted file mode 100644 index 7f1257c497..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_2D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<2, float, Serial>(Hierarchy<2, float, Serial> &hierarchy, - SubArray<2, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<2, float, Serial>(Hierarchy<2, float, Serial> &hierarchy, - SubArray<2, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Double.cpp deleted file mode 100644 index 9af949eda2..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Double.cpp +++ /dev/null @@ -1,25 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<3, double, Serial>(Hierarchy<3, double, Serial> &hierarchy, - SubArray<3, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<3, double, Serial>(Hierarchy<3, double, Serial> &hierarchy, - SubArray<3, double, Serial> &v, - SIZE l_target, int queue_idx); - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Float.cpp deleted file mode 100644 index 039087a200..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_3D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<3, float, Serial>(Hierarchy<3, float, Serial> &hierarchy, - SubArray<3, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<3, float, Serial>(Hierarchy<3, float, Serial> &hierarchy, - SubArray<3, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Double.cpp deleted file mode 100644 index 10bd8c9807..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<4, double, Serial>(Hierarchy<4, double, Serial> &hierarchy, - SubArray<4, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<4, double, Serial>(Hierarchy<4, double, Serial> &hierarchy, - SubArray<4, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Float.cpp deleted file mode 100644 index 70e147dfdc..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_4D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<4, float, Serial>(Hierarchy<4, float, Serial> &hierarchy, - SubArray<4, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<4, float, Serial>(Hierarchy<4, float, Serial> &hierarchy, - SubArray<4, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Double.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Double.cpp deleted file mode 100644 index 751d51c27c..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Double.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<5, double, Serial>(Hierarchy<5, double, Serial> &hierarchy, - SubArray<5, double, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<5, double, Serial>(Hierarchy<5, double, Serial> &hierarchy, - SubArray<5, double, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Float.cpp b/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Float.cpp deleted file mode 100644 index 513e797869..0000000000 --- a/src/mgard-x/DataRefactoring/Serial/Recomposition_5D_Float.cpp +++ /dev/null @@ -1,24 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/DataRefactoring/MultiDimension/DataRefactoring.hpp" -#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" - -#include - -#include -namespace mgard_x { - -template void - recompose<5, float, Serial>(Hierarchy<5, float, Serial> &hierarchy, - SubArray<5, float, Serial> &v, SIZE l_target, - int queue_idx); -template void - recompose_single<5, float, Serial>(Hierarchy<5, float, Serial> &hierarchy, - SubArray<5, float, Serial> &v, - SIZE l_target, int queue_idx); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/CMakeLists.txt b/src/mgard-x/DataRefactoring/SingleDimension/CMakeLists.txt new file mode 100644 index 0000000000..258dd0b00f --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/CMakeLists.txt @@ -0,0 +1,8 @@ +add_subdirectory(Coefficient) +add_subdirectory(Correction) +MgardXGenerateSourceAllCombinations("Decompose") +MgardXGenerateSourceAllCombinations("Recompose") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CMakeLists.txt b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CMakeLists.txt new file mode 100644 index 0000000000..7ca1850c9f --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CMakeLists.txt @@ -0,0 +1,6 @@ +MgardXGenerateSourceAllCombinations("CalcCoefficients") +MgardXGenerateSourceAllCombinations("CoefficientsRestore") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.cpp.in b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.cpp.in new file mode 100644 index 0000000000..a14844d10b --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/SingleDimension/Coefficient/CalcCoefficients.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCoefficients<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + DIM current_dim, SubArray<1, @DATA_TYPE@, @DEVICE_TYPE@> ratio, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> v, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> coarse, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> coeff, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.cpp.in b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.cpp.in new file mode 100644 index 0000000000..b2f2de4a06 --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/SingleDimension/Coefficient/CoefficientsRestore.hpp" +// clang-format off +namespace mgard_x { + +template void CoefficientsRestore<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + DIM current_dim, SubArray<1, @DATA_TYPE@, @DEVICE_TYPE@> ratio, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> v, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> coarse, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> coeff, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Correction/CMakeLists.txt b/src/mgard-x/DataRefactoring/SingleDimension/Correction/CMakeLists.txt new file mode 100644 index 0000000000..c80531852e --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Correction/CMakeLists.txt @@ -0,0 +1,5 @@ +MgardXGenerateSourceAllCombinations("CalcCorrection") +set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) +set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) +set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.cpp.in b/src/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.cpp.in new file mode 100644 index 0000000000..f357e86c95 --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.cpp.in @@ -0,0 +1,19 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. 
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/SingleDimension/Correction/CalcCorrection.hpp" +// clang-format off +namespace mgard_x { + +template void CalcCorrection<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &coeff, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &correction, + SIZE curr_dim, SIZE l, int queue_idx); + +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Decompose.cpp.in b/src/mgard-x/DataRefactoring/SingleDimension/Decompose.cpp.in new file mode 100644 index 0000000000..a40de0f84c --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Decompose.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory. + * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" +// clang-format off +namespace mgard_x { + +template void decompose_single<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &v, SIZE l_target, + int queue_idx); +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/DataRefactoring/SingleDimension/Recompose.cpp.in b/src/mgard-x/DataRefactoring/SingleDimension/Recompose.cpp.in new file mode 100644 index 0000000000..891956a814 --- /dev/null +++ b/src/mgard-x/DataRefactoring/SingleDimension/Recompose.cpp.in @@ -0,0 +1,17 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory.
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp" +// clang-format off +namespace mgard_x { + +template void recompose_single<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@>( + Hierarchy<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &hierarchy, + SubArray<@NUM_DIM@, @DATA_TYPE@, @DEVICE_TYPE@> &v, SIZE l_target, + int queue_idx); +} // namespace mgard_x +// clang-format on \ No newline at end of file diff --git a/src/mgard-x/Executables/mgard-x-autotuner.cpp b/src/mgard-x/Executables/mgard-x-autotuner.cpp index 104c124dc7..934f094e6f 100644 --- a/src/mgard-x/Executables/mgard-x-autotuner.cpp +++ b/src/mgard-x/Executables/mgard-x-autotuner.cpp @@ -68,13 +68,15 @@ int launch_compress(mgard_x::DIM D, enum mgard_x::data_type dtype, void autotuning(enum mgard_x::device_type dev_type, std::vector<mgard_x::SIZE> shape) { - if (dev_type == mgard_x::device_type::Serial) { + if (dev_type == mgard_x::device_type::SERIAL) { std::cout << mgard_x::log::log_info - << "Start autotuning MGARD-X::Serial.\n"; + << "Start autotuning MGARD-X::SERIAL.\n"; } else if (dev_type == mgard_x::device_type::CUDA) { std::cout << mgard_x::log::log_info << "Start auto tuning MGARD-X::CUDA.\n"; } else if (dev_type == mgard_x::device_type::HIP) { std::cout << mgard_x::log::log_info << "Start auto tuning MGARD-X::HIP.\n"; + } else if (dev_type == mgard_x::device_type::SYCL) { + std::cout << mgard_x::log::log_info << "Start auto tuning MGARD-X::SYCL.\n"; } mgard_x::BeginAutoTuning(dev_type); std::cout << mgard_x::log::log_info @@ -88,13 +90,15 @@ void autotuning(enum mgard_x::device_type dev_type, dev_type); std::cout << "Done.\n"; mgard_x::EndAutoTuning(dev_type); - if (dev_type == mgard_x::device_type::Serial) { + if (dev_type == mgard_x::device_type::SERIAL) { std::cout << mgard_x::log::log_info - << "Done auto tuning MGARD-X::Serial.\n"; + << "Done auto tuning MGARD-X::SERIAL.\n"; } else if (dev_type == mgard_x::device_type::CUDA) { std::cout << mgard_x::log::log_info << "Done auto tuning MGARD-X::CUDA.\n"; } else if (dev_type == mgard_x::device_type::HIP) { std::cout << mgard_x::log::log_info << "Done auto tuning MGARD-X::HIP.\n"; + } else if (dev_type == mgard_x::device_type::SYCL) { + std::cout << mgard_x::log::log_info << "Done auto tuning MGARD-X::SYCL.\n"; } std::cout << mgard_x::log::log_info << "Please recompile MGARD-X to make the auto tuning effective.\n"; @@ -105,7 +109,7 @@ void print_usage_message(std::string error) { std::cout << mgard_x::log::log_err << error << std::endl; } printf("* Full automatic mode: run 'mgard-x-autotuner' without arguments\n\ -* For a specific backend: run 'mgard-x-autotuner -d <serial|cuda|hip>'\n\ +* For a specific backend: run 'mgard-x-autotuner -d <serial|cuda|hip|sycl>'\n\ * For a specific input size on a specific backend: run 'mgard-x-autotuner -d -n [dim1] [dim2] ...
[dimN]'\n"); exit(0); } @@ -186,14 +190,17 @@ int main(int argc, char *argv[]) { std::cout << "\n"; std::string dev = get_arg(argc, argv, "-d"); if (dev.compare("serial") == 0) { - dev_type = mgard_x::device_type::Serial; - std::cout << mgard_x::log::log_info << "device type: Serial\n"; + dev_type = mgard_x::device_type::SERIAL; + std::cout << mgard_x::log::log_info << "device type: SERIAL\n"; } else if (dev.compare("cuda") == 0) { dev_type = mgard_x::device_type::CUDA; std::cout << mgard_x::log::log_info << "device type: CUDA\n"; } else if (dev.compare("hip") == 0) { dev_type = mgard_x::device_type::HIP; std::cout << mgard_x::log::log_info << "device type: HIP\n"; + } else if (dev.compare("sycl") == 0) { + dev_type = mgard_x::device_type::SYCL; + std::cout << mgard_x::log::log_info << "device type: SYCL\n"; } else { std::cout << "wrong device type.\n"; exit(-1); @@ -203,14 +210,17 @@ } std::vector<mgard_x::SIZE> shape({513, 513, 513}); std::string dev = get_arg(argc, argv, "-d"); if (dev.compare("serial") == 0) { - dev_type = mgard_x::device_type::Serial; - std::cout << mgard_x::log::log_info << "device type: Serial\n"; + dev_type = mgard_x::device_type::SERIAL; + std::cout << mgard_x::log::log_info << "device type: SERIAL\n"; } else if (dev.compare("cuda") == 0) { dev_type = mgard_x::device_type::CUDA; std::cout << mgard_x::log::log_info << "device type: CUDA\n"; } else if (dev.compare("hip") == 0) { dev_type = mgard_x::device_type::HIP; std::cout << mgard_x::log::log_info << "device type: HIP\n"; + } else if (dev.compare("sycl") == 0) { + dev_type = mgard_x::device_type::SYCL; + std::cout << mgard_x::log::log_info << "device type: SYCL\n"; } else { std::cout << "wrong device type.\n"; exit(-1); @@ -219,14 +229,17 @@ } else { std::cout << mgard_x::log::log_info << "Full automatic mode\n"; std::vector<mgard_x::SIZE> shape({513, 513, 513}); -#ifdef MGARD_ENABLE_SERIAL - autotuning(mgard_x::device_type::Serial, shape); +#if MGARD_ENABLE_SERIAL + autotuning(mgard_x::device_type::SERIAL, shape); #endif -#ifdef MGARD_ENABLE_CUDA +#if MGARD_ENABLE_CUDA autotuning(mgard_x::device_type::CUDA, shape); #endif -#ifdef MGARD_ENABLE_HIP +#if MGARD_ENABLE_HIP autotuning(mgard_x::device_type::HIP, shape); +#endif +#if MGARD_ENABLE_SYCL + autotuning(mgard_x::device_type::SYCL, shape); #endif } return 0; diff --git a/src/mgard-x/Executables/mgard-x.cpp b/src/mgard-x/Executables/mgard-x.cpp index 195822a468..eddea57f20 100644 --- a/src/mgard-x/Executables/mgard-x.cpp +++ b/src/mgard-x/Executables/mgard-x.cpp @@ -471,17 +471,20 @@ bool try_compression(int argc, char *argv[]) { enum mgard_x::device_type dev_type; std::string dev = get_arg(argc, argv, "-d"); if (dev.compare("auto") == 0) { - dev_type = mgard_x::device_type::Auto; - std::cout << mgard_x::log::log_info << "device type: Auto\n"; + dev_type = mgard_x::device_type::AUTO; + std::cout << mgard_x::log::log_info << "device type: AUTO\n"; } else if (dev.compare("serial") == 0) { - dev_type = mgard_x::device_type::Serial; - std::cout << mgard_x::log::log_info << "device type: Serial\n"; + dev_type = mgard_x::device_type::SERIAL; + std::cout << mgard_x::log::log_info << "device type: SERIAL\n"; } else if (dev.compare("cuda") == 0) { dev_type = mgard_x::device_type::CUDA; std::cout << mgard_x::log::log_info << "device type: CUDA\n"; } else if (dev.compare("hip") == 0) { dev_type = mgard_x::device_type::HIP; std::cout << mgard_x::log::log_info << "device type: HIP\n"; + } else if (dev.compare("sycl") == 0) { + dev_type
= mgard_x::device_type::SYCL; + std::cout << mgard_x::log::log_info << "device type: SYCL\n"; } else { print_usage_message("wrong device type."); } @@ -517,17 +520,20 @@ bool try_decompression(int argc, char *argv[]) { enum mgard_x::device_type dev_type; std::string dev = get_arg(argc, argv, "-d"); if (dev.compare("auto") == 0) { - dev_type = mgard_x::device_type::Auto; - std::cout << mgard_x::log::log_info << "device type: Auto\n"; + dev_type = mgard_x::device_type::AUTO; + std::cout << mgard_x::log::log_info << "device type: AUTO\n"; } else if (dev.compare("serial") == 0) { - dev_type = mgard_x::device_type::Serial; - std::cout << mgard_x::log::log_info << "device type: Serial\n"; + dev_type = mgard_x::device_type::SERIAL; + std::cout << mgard_x::log::log_info << "device type: SERIAL\n"; } else if (dev.compare("cuda") == 0) { dev_type = mgard_x::device_type::CUDA; std::cout << mgard_x::log::log_info << "device type: CUDA\n"; } else if (dev.compare("hip") == 0) { dev_type = mgard_x::device_type::HIP; std::cout << mgard_x::log::log_info << "device type: HIP\n"; + } else if (dev.compare("sycl") == 0) { + dev_type = mgard_x::device_type::SYCL; + std::cout << mgard_x::log::log_info << "device type: SYCL\n"; } else { print_usage_message("wrong device type."); } diff --git a/src/mgard-x/HighLevelAPI/CMakeLists.txt b/src/mgard-x/HighLevelAPI/CMakeLists.txt deleted file mode 100644 index 5a55fee427..0000000000 --- a/src/mgard-x/HighLevelAPI/CMakeLists.txt +++ /dev/null @@ -1,18 +0,0 @@ -if(MGARD_ENABLE_SERIAL) - add_subdirectory (Serial) - set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_CUDA) - add_subdirectory (CUDA) - set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) -endif() -if(MGARD_ENABLE_HIP) - add_subdirectory (HIP) - set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) -endif() - -list(APPEND MGARD_X_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/DynamicAPI.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/Metadata.cpp) - -set(MGARD_X_SRC ${MGARD_X_SRC} PARENT_SCOPE) diff --git a/src/mgard-x/HighLevelAPI/CUDA/CMakeLists.txt b/src/mgard-x/HighLevelAPI/CUDA/CMakeLists.txt deleted file mode 100644 index b6c2bfe8ea..0000000000 --- a/src/mgard-x/HighLevelAPI/CUDA/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_CUDA_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/HighLevelAPI.cu) - -set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/CUDA/HighLevelAPI.cu b/src/mgard-x/HighLevelAPI/CUDA/HighLevelAPI.cu deleted file mode 100644 index 6dae3be9a4..0000000000 --- a/src/mgard-x/HighLevelAPI/CUDA/HighLevelAPI.cu +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory.
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/HighLevelAPI.hpp" - -#include - -#include -namespace mgard_x { - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, Config config, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, - std::vector coords, Config config, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, - std::vector coords, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - Config config, bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - data_type &dtype, - std::vector &shape, Config config, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - data_type &dtype, - std::vector &shape, - bool output_pre_allocated); - -template void BeginAutoTuning(); -template void EndAutoTuning(); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/HIP/CMakeLists.txt b/src/mgard-x/HighLevelAPI/HIP/CMakeLists.txt deleted file mode 100644 index 553f8fdedb..0000000000 --- a/src/mgard-x/HighLevelAPI/HIP/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_HIP_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/HighLevelAPI.cpp) - -set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/HIP/HighLevelAPI.cpp b/src/mgard-x/HighLevelAPI/HIP/HighLevelAPI.cpp deleted file mode 100644 index ba9672cd3b..0000000000 --- a/src/mgard-x/HighLevelAPI/HIP/HighLevelAPI.cpp +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/HighLevelAPI.hpp" - -#include - -#include -namespace mgard_x { - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, Config config, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, - std::vector coords, Config config, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, void *&compressed_data, - size_t &compressed_size, - std::vector coords, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - Config config, bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - data_type &dtype, - std::vector &shape, Config config, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, void *&decompressed_data, - data_type &dtype, - std::vector &shape, - bool output_pre_allocated); - -template void BeginAutoTuning(); -template void EndAutoTuning(); - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/Metadata.cpp b/src/mgard-x/HighLevelAPI/Metadata.cpp deleted file mode 100644 index 64e292512e..0000000000 --- a/src/mgard-x/HighLevelAPI/Metadata.cpp +++ /dev/null @@ -1,122 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include -#include -#include -#include -#include - -// #include "compress_cuda.hpp" -#include "mgard-x/Hierarchy.h" -#include "mgard-x/Metadata.hpp" -#include "mgard-x/RuntimeX/RuntimeXPublic.h" - -namespace mgard_x { - -// bool verify(const void *compressed_data, size_t compressed_size) { -// if (compressed_size < SIGNATURE_SIZE) -// return false; -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// for (size_t i = 0; i < SIGNATURE_SIZE; i++) { -// if (meta.signature[i] != meta.mgard_signature[i]) { -// return false; -// } -// } -// return true; -// } - -// enum data_type infer_data_type(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// return meta.dtype; -// } - -// std::vector infer_shape(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } - -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// std::vector shape(meta.total_dims); -// for (DIM d = 0; d < meta.total_dims; d++) { -// shape[d] = (SIZE)meta.shape[d]; -// } -// return shape; -// } - -// enum data_structure_type infer_data_structure(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// return meta.dstype; -// } - -// template -// std::vector infer_coords(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// std::vector shape(meta.total_dims); -// for (DIM d = 0; d < meta.total_dims; d++) { -// shape[d] = (SIZE)meta.shape[d]; -// } -// std::vector coords(meta.total_dims); -// for (DIM d = 0; d < meta.total_dims; d++) { -// coords[d] = (T *)std::malloc(shape[d] * sizeof(T)); -// for (SIZE i = 0; i < shape[d]; i++) { -// coords[d][i] = (T)meta.coords[d][i]; -// } -// } -// return coords; -// } - -// template std::vector infer_coords(const void *compressed_data, -// size_t compressed_size); -// template std::vector infer_coords(const void *compressed_data, -// size_t compressed_size); - -// std::string infer_nonuniform_coords_file(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// return std::string(meta.nonuniform_coords_file); -// } - -// bool infer_domain_decomposed(const void *compressed_data, -// size_t compressed_size) { -// if (!verify(compressed_data, compressed_size)) { -// std::cout << log::log_err << "cannot verify the data!\n"; -// exit(-1); -// } -// Metadata meta; -// meta.Deserialize((SERIALIZED_TYPE *)compressed_data); -// return meta.domain_decomposed; -// } - -} // namespace mgard_x \ No 
newline at end of file diff --git a/src/mgard-x/HighLevelAPI/Serial/CMakeLists.txt b/src/mgard-x/HighLevelAPI/Serial/CMakeLists.txt deleted file mode 100644 index ffa7bbf66f..0000000000 --- a/src/mgard-x/HighLevelAPI/Serial/CMakeLists.txt +++ /dev/null @@ -1,4 +0,0 @@ -list(APPEND MGARD_X_SERIAL_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/HighLevelAPI.cpp) - -set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/HighLevelAPI/Serial/HighLevelAPI.cpp b/src/mgard-x/HighLevelAPI/Serial/HighLevelAPI.cpp deleted file mode 100644 index dd62370aba..0000000000 --- a/src/mgard-x/HighLevelAPI/Serial/HighLevelAPI.cpp +++ /dev/null @@ -1,65 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/HighLevelAPI.hpp" - -#include - -#include -namespace mgard_x { - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, - void *&compressed_data, size_t &compressed_size, - Config config, bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, - void *&compressed_data, size_t &compressed_size, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, - void *&compressed_data, size_t &compressed_size, - std::vector coords, Config config, - bool output_pre_allocated); - -template void compress(DIM D, data_type dtype, std::vector shape, - double tol, double s, enum error_bound_type mode, - const void *original_data, - void *&compressed_data, size_t &compressed_size, - std::vector coords, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, - void *&decompressed_data, Config config, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, - void *&decompressed_data, - bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, - void *&decompressed_data, data_type &dtype, - std::vector &shape, - Config config, bool output_pre_allocated); - -template void decompress(const void *compressed_data, - size_t compressed_size, - void *&decompressed_data, data_type &dtype, - std::vector &shape, - bool output_pre_allocated); - -template void BeginAutoTuning(); -template void EndAutoTuning(); -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/LosslessCompression.cu b/src/mgard-x/LosslessCompression.cu deleted file mode 100644 index d9048fca54..0000000000 --- a/src/mgard-x/LosslessCompression.cu +++ /dev/null @@ -1,469 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. 
- * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "compressors.hpp" -#include "cuda/Common.h" -#include "cuda/CommonInternal.h" -#include "cuda/LosslessCompression.h" -#include "cuda/ParallelHuffman/huffman_workflow.cuh" -// #include "cuda/ParallelHuffman/Huffman.hpp" - -#include - -namespace mgard_x { - -template -void cascaded_compress(Handle &handle, C *input_data, size_t intput_count, - void *&output_data, size_t &output_size, int n_rle, - int n_de, bool bitpack, int queue_idx) { - - nvcomp::CascadedCompressor compressor(nvcomp::TypeOf(), n_rle, n_de, - bitpack); - - size_t *temp_bytes; - cudaMallocHostHelper((void **)&temp_bytes, sizeof(size_t)); - size_t *output_bytes; - cudaMallocHostHelper((void **)&output_bytes, sizeof(size_t)); - - compressor.configure(intput_count * sizeof(C), temp_bytes, output_bytes); - - void *temp_space; - cudaMallocHelper(handle, &temp_space, *temp_bytes); - cudaMallocHelper(handle, &output_data, *output_bytes); - - compressor.compress_async(input_data, intput_count * sizeof(C), temp_space, - *temp_bytes, output_data, output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - handle.sync(queue_idx); - output_size = *output_bytes; - cudaFreeHelper(temp_space); - cudaFreeHostHelper(temp_bytes); - cudaFreeHostHelper(output_bytes); -} - -template -void cascaded_decompress(Handle &handle, void *input_data, - size_t input_size, C *&output_data, int queue_idx) { - - // nvcomp::Decompressor decompressor(input_data, input_size, - // *(cudaStream_t - // *)handle.get(queue_idx)); - - nvcomp::CascadedDecompressor decompressor; - - size_t *temp_bytes; - cudaMallocHostHelper((void **)&temp_bytes, sizeof(size_t)); - size_t *output_bytes; - cudaMallocHostHelper((void **)&output_bytes, sizeof(size_t)); - - decompressor.configure(input_data, input_size, temp_bytes, output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - - void *temp_space; - cudaMallocHelper(handle, (void **)&temp_space, *temp_bytes); - cudaMallocHelper(handle, (void **)&output_data, *output_bytes); - - decompressor.decompress_async(input_data, input_size, temp_space, *temp_bytes, - output_data, *output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - handle.sync(queue_idx); - cudaFreeHelper(temp_space); - cudaFreeHostHelper(temp_bytes); - cudaFreeHostHelper(output_bytes); -} - -template -void lz4_compress(Handle &handle, C *input_data, size_t input_count, - void *&output_data, size_t &output_size, size_t chunk_size, - int queue_idx) { - nvcompType_t dtype = NVCOMP_TYPE_UCHAR; - nvcomp::LZ4Compressor compressor(chunk_size, dtype); - - size_t *temp_bytes; - cudaMallocHostHelper((void **)&temp_bytes, sizeof(size_t)); - size_t *output_bytes; - cudaMallocHostHelper((void **)&output_bytes, sizeof(size_t)); - - compressor.configure(input_count * sizeof(C), temp_bytes, output_bytes); - - void *temp_space; - cudaMallocHelper(handle, &temp_space, *temp_bytes); - cudaMallocHelper(handle, &output_data, *output_bytes); - - compressor.compress_async(input_data, input_count * sizeof(C), temp_space, - *temp_bytes, output_data, output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - - handle.sync(queue_idx); - output_size = *output_bytes; - cudaFreeHelper(temp_space); - cudaFreeHostHelper(temp_bytes); - cudaFreeHostHelper(output_bytes); -} - -template -void lz4_decompress(Handle &handle, void *input_data, size_t input_size, - C *&output_data, size_t &output_size, int queue_idx) { - - 
nvcomp::LZ4Decompressor decompressor; - - size_t *temp_bytes; - cudaMallocHostHelper((void **)&temp_bytes, sizeof(size_t)); - size_t *output_bytes; - cudaMallocHostHelper((void **)&output_bytes, sizeof(size_t)); - - decompressor.configure(input_data, input_size, temp_bytes, output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - - void *temp_space; - cudaMallocHelper(handle, (void **)&temp_space, *temp_bytes); - cudaMallocHelper(handle, (void **)&output_data, *output_bytes); - - decompressor.decompress_async(input_data, input_size, temp_space, *temp_bytes, - output_data, *output_bytes, - *(cudaStream_t *)handle.get(queue_idx)); - handle.sync(queue_idx); - output_size = *output_bytes; - cudaFreeHelper(temp_space); - cudaFreeHostHelper(temp_bytes); - cudaFreeHostHelper(output_bytes); -} - -#define KERNELS(D, T, C) \ - template void cascaded_compress( \ - Handle & handle, C * input_data, size_t intput_count, \ - void *&output_data, size_t &output_size, int n_rle, int n_de, \ - bool bitpack, int queue_idx); \ - template void cascaded_decompress( \ - Handle & handle, void *input_data, size_t input_size, \ - C *&output_data, int queue_idx); \ - template void lz4_compress(Handle & handle, C * input_data, \ - size_t input_count, void *&output_data, \ - size_t &output_size, size_t chunk_size, \ - int queue_idx); \ - template void lz4_decompress( \ - Handle & handle, void *input_data, size_t input_size, \ - C *&output_data, size_t &output_count, int queue_idx); - -KERNELS(1, double, uint8_t) -KERNELS(1, float, uint8_t) -KERNELS(2, double, uint8_t) -KERNELS(2, float, uint8_t) -KERNELS(3, double, uint8_t) -KERNELS(3, float, uint8_t) -KERNELS(4, double, uint8_t) -KERNELS(4, float, uint8_t) -KERNELS(5, double, uint8_t) -KERNELS(5, float, uint8_t) -KERNELS(1, double, uint32_t) -KERNELS(1, float, uint32_t) -KERNELS(2, double, uint32_t) -KERNELS(2, float, uint32_t) -KERNELS(3, double, uint32_t) -KERNELS(3, float, uint32_t) -KERNELS(4, double, uint32_t) -KERNELS(4, float, uint32_t) -KERNELS(5, double, uint32_t) -KERNELS(5, float, uint32_t) -KERNELS(1, double, uint64_t) -KERNELS(1, float, uint64_t) -KERNELS(2, double, uint64_t) -KERNELS(2, float, uint64_t) -KERNELS(3, double, uint64_t) -KERNELS(3, float, uint64_t) -KERNELS(4, double, uint64_t) -KERNELS(4, float, uint64_t) -KERNELS(5, double, uint64_t) -KERNELS(5, float, uint64_t) -#undef KERNELS - -template -void SeparateOutlierAndPrimary(Handle &handle, S *dqv, size_t n, - size_t *outlier_idx, size_t outlier_count, - size_t primary_count, S *doutlier, Q *dprimary, - int queue_idx) { - - // printf("compress outlier_idx: "); for(int i = 0; i < outlier_count; i++) - // {printf("%llu ", outlier_idx[i]);} printf("\n"); - printf("compress outlier_count: %llu\n", outlier_count); - printf("compress primary_count: %llu\n", primary_count); - printf("start separating primary and outlier\n"); - - size_t p = 0; - size_t pp = 0; - size_t op = 0; - size_t size = outlier_idx[0] - 0; - // printf("copy primary\n"); - if (size > 0) { - mgard_x::cudaMemcpyAsyncHelper(handle, dprimary + pp, dqv + p, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - pp += size; - p += size; - - for (int i = 0; i < outlier_count - 1; i++) { - size = 1; - // printf("copy outlier\n"); - mgard_x::cudaMemcpyAsyncHelper(handle, doutlier + op, dqv + p, - size * sizeof(S), mgard_x::D2D, queue_idx); - op += size; - p += size; - size = outlier_idx[i + 1] - outlier_idx[i] - 1; - // printf("copy primary %d %d %d\n", p, size, outlier_idx[outlier_idx.size() - // - 1]); - if (size > 0) { - 
mgard_x::cudaMemcpyAsyncHelper(handle, dprimary + pp, dqv + p, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - pp += size; - p += size; - } - size = 1; - // printf("copy outlier\n"); - mgard_x::cudaMemcpyAsyncHelper(handle, doutlier + op, dqv + p, - size * sizeof(S), mgard_x::D2D, queue_idx); - op += size; - p += size; - size = n - outlier_idx[outlier_count - 1] - 1; - // printf("copy primary %d %d %d\n", p, size, outlier_idx[outlier_idx.size() - - // 1]); - if (size > 0) { - mgard_x::cudaMemcpyAsyncHelper(handle, dprimary + pp, dqv + p, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - // printf("done copy primary\n"); - pp += size; - p += size; - - if (pp != primary_count || op != outlier_count) { - printf("Primary or outlier size mismatch!\n"); - } - printf("done separating primary and outlier\n"); -} - -template -void CombineOutlierAndPrimary(Handle &handle, S *dqv, size_t n, - size_t *outlier_idx, size_t outlier_count, - size_t primary_count, S *doutlier, Q *dprimary, - int queue_idx) { - size_t p = 0; - size_t pp = 0; - size_t op = 0; - size_t size = outlier_idx[0] - 0; - // printf("copy primary\n"); - if (size > 0) { - mgard_x::cudaMemcpyAsyncHelper(handle, dqv + p, dprimary + pp, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - pp += size; - p += size; - - for (int i = 0; i < outlier_count - 1; i++) { - size = 1; - // printf("copy outlier\n"); - mgard_x::cudaMemcpyAsyncHelper(handle, dqv + p, doutlier + op, - size * sizeof(S), mgard_x::D2D, queue_idx); - op += size; - p += size; - size = outlier_idx[i + 1] - outlier_idx[i] - 1; - // printf("copy primary %d %d %d\n", p, size, outlier_idx[outlier_idx.size() - // - 1]); - if (size > 0) { - mgard_x::cudaMemcpyAsyncHelper(handle, dqv + p, dprimary + pp, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - pp += size; - p += size; - } - size = 1; - // printf("copy outlier\n"); - mgard_x::cudaMemcpyAsyncHelper(handle, dqv + p, doutlier + op, - size * sizeof(S), mgard_x::D2D, queue_idx); - op += size; - p += size; - size = n - outlier_idx[outlier_count - 1] - 1; - // printf("copy primary %d %d %d\n", p, size, outlier_idx[outlier_idx.size() - - // 1]); - if (size > 0) { - mgard_x::cudaMemcpyAsyncHelper(handle, dqv + p, dprimary + pp, - size * sizeof(Q), mgard_x::D2D, queue_idx); - } - // printf("done copy primary\n"); - pp += size; - p += size; -} - -#define KERNELS(D, T, S, Q) \ - template void SeparateOutlierAndPrimary( \ - Handle & handle, S * dqv, size_t n, size_t * outlier_idx,\ - size_t outlier_count, \ - size_t primary_count,\ - S * doutlier, \ - Q * dprimary, int queue_idx); \ - template void CombineOutlierAndPrimary( \ - Handle & handle, S * dqv, size_t n, size_t * outlier_idx,\ - size_t outlier_count, \ - size_t primary_count,\ - S * doutlier, \ - Q * dprimary, int queue_idx); - -KERNELS(1, double, int, uint32_t) -KERNELS(1, float, int, uint32_t) -KERNELS(2, double, int, uint32_t) -KERNELS(2, float, int, uint32_t) -KERNELS(3, double, int, uint32_t) -KERNELS(3, float, int, uint32_t) -KERNELS(4, double, int, uint32_t) -KERNELS(4, float, int, uint32_t) -KERNELS(5, double, int, uint32_t) -KERNELS(5, float, int, uint32_t) -#undef KERNELS - -template -void huffman_compress(Handle &handle, S *input_data, size_t input_count, - std::vector &outlier_idx, H *&out_meta, - size_t &out_meta_size, H *&out_data, - size_t &out_data_size, int chunk_size, int dict_size, - int queue_idx) { - - // HuffmanEncode(handle, input_data, input_count, - // outlier_idx, - // out_meta, out_meta_size, out_data, - // out_data_size, chunk_size, dict_size); 
-} - -template -void huffman_decompress(Handle &handle, H *in_meta, size_t in_meta_size, - H *in_data, size_t in_data_size, S *&output_data, - size_t &output_count, int queue_idx) { - // HuffmanDecode(handle, output_data, output_count, - // in_meta, - // in_meta_size, in_data, in_data_size); -} - -#define KERNELS(D, T, S, Q, H) \ - template void huffman_compress( \ - Handle & handle, S * input_data, size_t input_count, \ - std::vector & outlier_idx, H * &out_meta, \ - size_t & out_meta_size, H * &out_data, size_t & out_data_size, \ - int chunk_size, int dict_size, int queue_idx); \ - template void huffman_decompress( \ - Handle & handle, H * in_meta, size_t in_meta_size, H * in_data, \ - size_t in_data_size, S * &output_data, size_t & output_count, \ - int queue_idx); - -KERNELS(1, double, int, uint32_t, uint32_t) -KERNELS(1, float, int, uint32_t, uint32_t) -KERNELS(2, double, int, uint32_t, uint32_t) -KERNELS(2, float, int, uint32_t, uint32_t) -KERNELS(3, double, int, uint32_t, uint32_t) -KERNELS(3, float, int, uint32_t, uint32_t) -KERNELS(4, double, int, uint32_t, uint32_t) -KERNELS(4, float, int, uint32_t, uint32_t) -KERNELS(5, double, int, uint32_t, uint32_t) -KERNELS(5, float, int, uint32_t, uint32_t) -KERNELS(1, double, int, uint32_t, uint64_t) -KERNELS(1, float, int, uint32_t, uint64_t) -KERNELS(2, double, int, uint32_t, uint64_t) -KERNELS(2, float, int, uint32_t, uint64_t) -KERNELS(3, double, int, uint32_t, uint64_t) -KERNELS(3, float, int, uint32_t, uint64_t) -KERNELS(4, double, int, uint32_t, uint64_t) -KERNELS(4, float, int, uint32_t, uint64_t) -KERNELS(5, double, int, uint32_t, uint64_t) -KERNELS(5, float, int, uint32_t, uint64_t) - -template -void cpu_lossless_compression(Handle &handle, S *input_data, - size_t input_count, H *&out_data, - size_t &out_data_size) { - - int *int_vector = new int[input_count]; - - cudaMemcpyAsyncHelper(handle, int_vector, input_data, input_count * sizeof(S), - AUTO, 0); - handle.sync(0); - - std::vector input_vector(input_count); - for (int i = 0; i < input_count; i++) - input_vector[i] = int_vector[i]; - - // printf("%u %u\n", sizeof(long int), sizeof(int)); - // printf("dqv\n"); - // print_matrix_cuda(1, input_count, input_data, input_count); - - // printf("input_vector: "); - // for (int i = 0; i < input_vector.size(); i++) printf("%d ", - // input_vector[i]); printf("\n"); Compress an array of data using `zstd`. 
- std::size_t zstd_outsize; - - void *const buffer = - mgard::compress_memory_huffman(input_vector, zstd_outsize); - - out_data_size = zstd_outsize; - - cudaMallocHelper(handle, (void **)&out_data, out_data_size); - cudaMemcpyAsyncHelper(handle, out_data, buffer, out_data_size, AUTO, 0); - handle.sync(0); - delete[] int_vector; -} - -template -void cpu_lossless_decompression(Handle &handle, H *input_data, - size_t input_count, S *&out_data, - size_t output_count) { - - // printf("cpu decompression: %llu\n", input_count); - std::vector input_vector(input_count); - cudaMemcpyAsyncHelper(handle, input_vector.data(), input_data, input_count, - AUTO, 0); - handle.sync(0); - // printf("copy done\n"); - - long int *output_vector = new long int[output_count]; - int *int_vector = new int[output_count]; - - mgard::decompress_memory_huffman( - reinterpret_cast(input_vector.data()), - input_vector.size(), output_vector, - output_count * sizeof(*output_vector)); - - for (int i = 0; i < output_count; i++) - int_vector[i] = output_vector[i]; - cudaMallocHelper(handle, (void **)&out_data, output_count * sizeof(S)); - cudaMemcpyAsyncHelper(handle, out_data, int_vector, output_count * sizeof(S), - AUTO, 0); - handle.sync(0); - delete[] output_vector; - delete[] int_vector; - - // printf("dqv\n"); - // print_matrix_cuda(1, output_count, out_data, output_count); -} - -#define KERNELS(D, T, S, H) \ - template void cpu_lossless_compression( \ - Handle & handle, S * input_data, size_t input_count, \ - H * &out_data, size_t & out_data_size); \ - template void cpu_lossless_decompression( \ - Handle & handle, H * input_data, size_t input_count, \ - S * &out_data, size_t output_count); - -KERNELS(1, double, int, unsigned char) -KERNELS(1, float, int, unsigned char) -KERNELS(2, double, int, unsigned char) -KERNELS(2, float, int, unsigned char) -KERNELS(3, double, int, unsigned char) -KERNELS(3, float, int, unsigned char) -KERNELS(4, double, int, unsigned char) -KERNELS(4, float, int, unsigned char) -KERNELS(5, double, int, unsigned char) -KERNELS(5, float, int, unsigned char) - -} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerCuda.cu b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerCuda.cu index 9bb60b6038..363f5c8a7b 100644 --- a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerCuda.cu +++ b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerCuda.cu @@ -62,6 +62,9 @@ int AutoTuningTable<CUDA>::lwdqzk[2][9] = {{0, 0, 0, 0, 0, 0, 0, 0, 0}, int AutoTuningTable<CUDA>::llk[2][9] = {{0, 0, 0, 0, 0, 0, 0, 0, 0}, {0, 0, 0, 0, 0, 0, 0, 0, 0}}; +template void BeginAutoTuning<CUDA>(); +template void EndAutoTuning<CUDA>(); + } // namespace mgard_x // clang-format on #undef MGARDX_COMPILE_CUDA \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerHip.cpp b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerHip.cpp index 3a91e76f7d..9c7254c5c5 100644 --- a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerHip.cpp +++ b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerHip.cpp @@ -61,6 +61,10 @@ int AutoTuningTable<HIP>::lwdqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; int AutoTuningTable<HIP>::llk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +template void BeginAutoTuning<HIP>(); +template void EndAutoTuning<HIP>(); + } // namespace mgard_x // clang-format on #undef MGARDX_COMPILE_HIP \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.cpp b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.cpp index f735ef4e2e..58fc4245e6 100644 ---
a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.cpp +++ b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSerial.cpp @@ -2,65 +2,69 @@ // clang-format off namespace mgard_x { -int AutoTuningTable<Serial>::gpk_reo_3d[2][9] = {{5, 5, 5, 3, 3, 5, 5, 0, 0}, +int AutoTuningTable<SERIAL>::gpk_reo_3d[2][9] = {{5, 5, 5, 3, 3, 5, 5, 0, 0}, {3, 6, 5, 3, 3, 3, 5, 0, 0}}; -int AutoTuningTable<Serial>::gpk_rev_3d[2][9] = {{2, 4, 5, 5, 3, 5, 5, 0, 0}, +int AutoTuningTable<SERIAL>::gpk_rev_3d[2][9] = {{2, 4, 5, 5, 3, 5, 5, 0, 0}, {3, 6, 6, 5, 3, 5, 6, 0, 0}}; -int AutoTuningTable<Serial>::gpk_reo_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::gpk_reo_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, {0, 0, 3, 4, 5, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::gpk_rev_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::gpk_rev_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, {0, 0, 3, 4, 5, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::lpk1_3d[2][9] = {{4, 4, 1, 1, 1, 1, 1, 0, 0}, +int AutoTuningTable<SERIAL>::lpk1_3d[2][9] = {{4, 4, 1, 1, 1, 1, 1, 0, 0}, {1, 1, 1, 1, 1, 1, 1, 0, 0}}; -int AutoTuningTable<Serial>::lpk2_3d[2][9] = {{5, 4, 4, 4, 3, 3, 4, 0, 0}, +int AutoTuningTable<SERIAL>::lpk2_3d[2][9] = {{5, 4, 4, 4, 3, 3, 4, 0, 0}, {4, 1, 1, 1, 1, 1, 3, 0, 0}}; -int AutoTuningTable<Serial>::lpk3_3d[2][9] = {{4, 4, 3, 3, 2, 3, 4, 0, 0}, +int AutoTuningTable<SERIAL>::lpk3_3d[2][9] = {{4, 4, 3, 3, 2, 3, 4, 0, 0}, {1, 1, 1, 1, 1, 1, 2, 0, 0}}; -int AutoTuningTable<Serial>::lpk1_nd[2][9] = {{2, 0, 1, 1, 1, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::lpk1_nd[2][9] = {{2, 0, 1, 1, 1, 0, 0, 0, 0}, {0, 0, 1, 1, 1, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::lpk2_nd[2][9] = {{2, 1, 3, 1, 0, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::lpk2_nd[2][9] = {{2, 1, 3, 1, 0, 0, 0, 0, 0}, {0, 2, 1, 1, 0, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::lpk3_nd[2][9] = {{2, 3, 1, 1, 0, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::lpk3_nd[2][9] = {{2, 3, 1, 1, 0, 0, 0, 0, 0}, {0, 2, 1, 1, 0, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::ipk1_3d[2][9] = {{3, 3, 4, 5, 5, 3, 4, 0, 0}, +int AutoTuningTable<SERIAL>::ipk1_3d[2][9] = {{3, 3, 4, 5, 5, 3, 4, 0, 0}, {3, 6, 4, 4, 3, 3, 3, 0, 0}}; -int AutoTuningTable<Serial>::ipk2_3d[2][9] = {{3, 3, 2, 2, 2, 2, 6, 0, 0}, +int AutoTuningTable<SERIAL>::ipk2_3d[2][9] = {{3, 3, 2, 2, 2, 2, 6, 0, 0}, {2, 2, 2, 2, 2, 2, 5, 0, 0}}; -int AutoTuningTable<Serial>::ipk3_3d[2][9] = {{3, 3, 2, 2, 2, 2, 1, 0, 0}, +int AutoTuningTable<SERIAL>::ipk3_3d[2][9] = {{3, 3, 2, 2, 2, 2, 1, 0, 0}, {2, 2, 2, 2, 2, 2, 6, 0, 0}}; -int AutoTuningTable<Serial>::ipk1_nd[2][9] = {{0, 2, 3, 3, 0, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::ipk1_nd[2][9] = {{0, 2, 3, 3, 0, 0, 0, 0, 0}, {0, 3, 3, 3, 0, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::ipk2_nd[2][9] = {{0, 1, 2, 2, 0, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::ipk2_nd[2][9] = {{0, 1, 2, 2, 0, 0, 0, 0, 0}, {0, 2, 2, 2, 0, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::ipk3_nd[2][9] = {{0, 2, 3, 2, 0, 0, 0, 0, 0}, +int AutoTuningTable<SERIAL>::ipk3_nd[2][9] = {{0, 2, 3, 2, 0, 0, 0, 0, 0}, {0, 3, 4, 2, 0, 0, 0, 0, 0}}; -int AutoTuningTable<Serial>::lwpk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, +int AutoTuningTable<SERIAL>::lwpk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; -int AutoTuningTable<Serial>::lwqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, +int AutoTuningTable<SERIAL>::lwqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; -int AutoTuningTable<Serial>::lwdqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, +int AutoTuningTable<SERIAL>::lwdqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; -int AutoTuningTable<Serial>::llk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, +int AutoTuningTable<SERIAL>::llk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +template void BeginAutoTuning<SERIAL>(); +template void EndAutoTuning<SERIAL>(); + } // namespace mgard_x // clang-format on #undef MGARDX_COMPILE_SERIAL \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.cpp b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.cpp new file mode 100644 index 0000000000..4f3abaf39d --- /dev/null +++ b/src/mgard-x/RuntimeX/AutoTuners/AutoTunerSycl.cpp @@ -0,0 +1,70 @@ +#include "mgard-x/RuntimeX/RuntimeX.h" +// clang-format off +namespace mgard_x { + +int AutoTuningTable<SYCL>::gpk_reo_3d[2][9] = {{5, 5, 5, 3, 3, 5, 5, 0, 0}, + {3, 6, 5, 3, 3, 3, 5, 0, 0}}; + +int AutoTuningTable<SYCL>::gpk_rev_3d[2][9] = {{2, 4, 5, 5, 3, 5, 5, 0, 0}, + {3, 6, 6, 5, 3, 5, 6, 0, 0}}; + +int AutoTuningTable<SYCL>::gpk_reo_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, + {0, 0, 3, 4, 5, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::gpk_rev_nd[2][9] = {{0, 0, 3, 4, 3, 0, 0, 0, 0}, + {0, 0, 3, 4, 5, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk1_3d[2][9] = {{4, 4, 1, 1, 1, 1, 1, 0, 0}, + {1, 1, 1, 1, 1, 1, 1, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk2_3d[2][9] = {{5, 4, 4, 4, 3, 3, 4, 0, 0}, + {4, 1, 1, 1, 1, 1, 3, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk3_3d[2][9] = {{4, 4, 3, 3, 2, 3, 4, 0, 0}, + {1, 1, 1, 1, 1, 1, 2, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk1_nd[2][9] = {{2, 0, 1, 1, 1, 0, 0, 0, 0}, + {0, 0, 1, 1, 1, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk2_nd[2][9] = {{2, 1, 3, 1, 0, 0, 0, 0, 0}, + {0, 2, 1, 1, 0, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::lpk3_nd[2][9] = {{2, 3, 1, 1, 0, 0, 0, 0, 0}, + {0, 2, 1, 1, 0, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk1_3d[2][9] = {{3, 3, 4, 5, 5, 3, 4, 0, 0}, + {3, 6, 4, 4, 3, 3, 3, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk2_3d[2][9] = {{3, 3, 2, 2, 2, 2, 6, 0, 0}, + {2, 2, 2, 2, 2, 2, 5, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk3_3d[2][9] = {{3, 3, 2, 2, 2, 2, 1, 0, 0}, + {2, 2, 2, 2, 2, 2, 6, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk1_nd[2][9] = {{0, 2, 3, 3, 0, 0, 0, 0, 0}, + {0, 3, 3, 3, 0, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk2_nd[2][9] = {{0, 1, 2, 2, 0, 0, 0, 0, 0}, + {0, 2, 2, 2, 0, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::ipk3_nd[2][9] = {{0, 2, 3, 2, 0, 0, 0, 0, 0}, + {0, 3, 4, 2, 0, 0, 0, 0, 0}}; + +int AutoTuningTable<SYCL>::lwpk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, + {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +int AutoTuningTable<SYCL>::lwqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, + {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +int AutoTuningTable<SYCL>::lwdqzk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, + {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +int AutoTuningTable<SYCL>::llk[2][9] = {{5, 2, 2, 1, 0, 2, 1, 0, 0}, + {2, 2, 3, 3, 0, 2, 5, 0, 0}}; + +template void BeginAutoTuning<SYCL>(); +template void EndAutoTuning<SYCL>(); + +} // namespace mgard_x +// clang-format on +#undef MGARDX_COMPILE_SYCL \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt b/src/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt index 88fad48f09..488a237a84 100644 --- a/src/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt +++ b/src/mgard-x/RuntimeX/AutoTuners/CMakeLists.txt @@ -13,6 +13,11 @@ if(MGARD_ENABLE_HIP) ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerHip.cpp) set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) endif() +if(MGARD_ENABLE_SYCL) + list(APPEND MGARD_X_SYCL_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerSycl.cpp) + set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) +endif() if(MGARD_ENABLE_KOKKOS) list(APPEND MGARD_X_SRC ${CMAKE_CURRENT_SOURCE_DIR}/AutoTunerKokkos.cpp) diff --git a/src/mgard-x/RuntimeX/CMakeLists.txt b/src/mgard-x/RuntimeX/CMakeLists.txt index 0b8f543e8b..cc3b28c1b7 100644 --- a/src/mgard-x/RuntimeX/CMakeLists.txt +++ b/src/mgard-x/RuntimeX/CMakeLists.txt @@ -4,4 +4,5 @@
add_subdirectory (Utilities) set(MGARD_X_SERIAL_SRC ${MGARD_X_SERIAL_SRC} PARENT_SCOPE) set(MGARD_X_CUDA_SRC ${MGARD_X_CUDA_SRC} PARENT_SCOPE) set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) +set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) set(MGARD_X_SRC ${MGARD_X_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/DeviceAdapters/CMakeLists.txt b/src/mgard-x/RuntimeX/DeviceAdapters/CMakeLists.txt index 651543fb27..eb71bc40f5 100644 --- a/src/mgard-x/RuntimeX/DeviceAdapters/CMakeLists.txt +++ b/src/mgard-x/RuntimeX/DeviceAdapters/CMakeLists.txt @@ -13,6 +13,11 @@ if(MGARD_ENABLE_HIP) ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterHip.cpp) set(MGARD_X_HIP_SRC ${MGARD_X_HIP_SRC} PARENT_SCOPE) endif() +if(MGARD_ENABLE_SYCL) + list(APPEND MGARD_X_SYCL_SRC + ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterSycl.cpp) + set(MGARD_X_SYCL_SRC ${MGARD_X_SYCL_SRC} PARENT_SCOPE) +endif() if(MGARD_ENABLE_KOKKOS) list(APPEND MGARD_X_KOKKOS_SRC ${CMAKE_CURRENT_SOURCE_DIR}/DeviceAdapterKokkos.cpp) diff --git a/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.cpp b/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.cpp index 2461062814..af16904532 100644 --- a/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.cpp +++ b/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSerial.cpp @@ -9,20 +9,20 @@ namespace mgard_x { -int DeviceRuntime<Serial>::curr_dev_id = 0; -DeviceQueues<Serial> DeviceRuntime<Serial>::queues; -DeviceSpecification<Serial> DeviceRuntime<Serial>::DeviceSpecs; +int DeviceRuntime<SERIAL>::curr_dev_id = 0; +DeviceQueues<SERIAL> DeviceRuntime<SERIAL>::queues; +DeviceSpecification<SERIAL> DeviceRuntime<SERIAL>::DeviceSpecs; -bool DeviceRuntime<Serial>::SyncAllKernelsAndCheckErrors = false; -bool MemoryManager<Serial>::ReduceMemoryFootprint = false; -bool DeviceRuntime<Serial>::TimingAllKernels = false; -bool DeviceRuntime<Serial>::PrintKernelConfig = false; +bool DeviceRuntime<SERIAL>::SyncAllKernelsAndCheckErrors = false; +bool MemoryManager<SERIAL>::ReduceMemoryFootprint = false; +bool DeviceRuntime<SERIAL>::TimingAllKernels = false; +bool DeviceRuntime<SERIAL>::PrintKernelConfig = false; -AutoTuningTable<Serial> AutoTuner<Serial>::autoTuningTable; -bool AutoTuner<Serial>::ProfileKernels = false; +AutoTuningTable<SERIAL> AutoTuner<SERIAL>::autoTuningTable; +bool AutoTuner<SERIAL>::ProfileKernels = false; -template <> bool deviceAvailable<Serial>() { - return DeviceRuntime<Serial>::GetDeviceCount() > 0; +template <> bool deviceAvailable<SERIAL>() { + return DeviceRuntime<SERIAL>::GetDeviceCount() > 0; } } // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.cpp b/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.cpp new file mode 100644 index 0000000000..f950c2dc6f --- /dev/null +++ b/src/mgard-x/RuntimeX/DeviceAdapters/DeviceAdapterSycl.cpp @@ -0,0 +1,29 @@ +/* + * Copyright 2022, Oak Ridge National Laboratory.
+ * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs + * Author: Jieyang Chen (chenj3@ornl.gov) + * Date: March 17, 2022 + */ + +#include "mgard-x/RuntimeX/RuntimeX.h" + +namespace mgard_x { + +int DeviceRuntime<SYCL>::curr_dev_id = 0; +DeviceQueues<SYCL> DeviceRuntime<SYCL>::queues; +DeviceSpecification<SYCL> DeviceRuntime<SYCL>::DeviceSpecs; + +// SyncAllKernelsAndCheckErrors needs to be always ON for SYCL +bool DeviceRuntime<SYCL>::SyncAllKernelsAndCheckErrors = true; +bool MemoryManager<SYCL>::ReduceMemoryFootprint = false; +bool DeviceRuntime<SYCL>::TimingAllKernels = false; +bool DeviceRuntime<SYCL>::PrintKernelConfig = false; + +AutoTuningTable<SYCL> AutoTuner<SYCL>::autoTuningTable; +bool AutoTuner<SYCL>::ProfileKernels = false; + +template <> bool deviceAvailable<SYCL>() { + return DeviceRuntime<SYCL>::GetDeviceCount() > 0; +} + +} // namespace mgard_x \ No newline at end of file diff --git a/src/mgard-x/Utilities/CMakeLists.txt b/src/mgard-x/Utilities/CMakeLists.txt deleted file mode 100644 index d5fa582eab..0000000000 --- a/src/mgard-x/Utilities/CMakeLists.txt +++ /dev/null @@ -1,3 +0,0 @@ -list(APPEND MGARD_X_SRC - ${CMAKE_CURRENT_SOURCE_DIR}/CheckEndianess.cpp) -set(MGARD_X_SRC ${MGARD_X_SRC} PARENT_SCOPE) \ No newline at end of file diff --git a/src/mgard-x/Utilities/CheckEndianess.cpp b/src/mgard-x/Utilities/CheckEndianess.cpp deleted file mode 100644 index badd2524cf..0000000000 --- a/src/mgard-x/Utilities/CheckEndianess.cpp +++ /dev/null @@ -1,21 +0,0 @@ -/* - * Copyright 2022, Oak Ridge National Laboratory. - * MGARD-X: MultiGrid Adaptive Reduction of Data Portable across GPUs and CPUs - * Author: Jieyang Chen (chenj3@ornl.gov) - * Date: March 17, 2022 - */ - -#include "mgard-x/Types.h" - -namespace mgard_x { -enum endiness_type CheckEndianess() { - int i = 1; - char *p = (char *)&i; - if (p[0] == 1) { - return endiness_type::Little_Endian; - } else { - return endiness_type::Big_Endian; - } -} - -} // namespace mgard_x
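
Note on the generated sources above: each *.cpp.in template is expanded once per (NUM_DIM, DATA_TYPE, DEVICE_TYPE) combination by MgardXGenerateSourceAllCombinations, yielding one explicit-instantiation translation unit per combination. A minimal sketch of one expanded unit, assuming a configure_file-style substitution of the @...@ placeholders (the generator module itself is not shown in this section of the patch):

// Decompose.cpp as it would be generated from Decompose.cpp.in for the
// hypothetical combination NUM_DIM=3, DATA_TYPE=float, DEVICE_TYPE=SYCL.
#include "mgard-x/DataRefactoring/SingleDimension/DataRefactoring.hpp"

namespace mgard_x {

template void decompose_single<3, float, SYCL>(
    Hierarchy<3, float, SYCL> &hierarchy,
    SubArray<3, float, SYCL> &v, SIZE l_target, int queue_idx);

} // namespace mgard_x

Splitting the instantiations this way keeps each generated file small, so per-backend combinations compile in parallel and only the enabled backends are built.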
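Note on backend selection: the executables map the -d flag onto the renamed device_type enumerators (AUTO, SERIAL, CUDA, HIP, SYCL). A hedged sketch of doing the same from library code, assuming the high-level entry point keeps the argument order visible in the deleted instantiation files and that Config exposes a dev_type field (both are assumptions here, not verbatim API from this diff):

#include "mgard-x/HighLevelAPI.hpp" // header name as used in the deleted files

#include <cstdlib>
#include <vector>

int main() {
  std::vector<mgard_x::SIZE> shape{64, 64, 64};
  std::vector<float> original(64 * 64 * 64, 1.0f);

  mgard_x::Config config;
  config.dev_type = mgard_x::device_type::SYCL; // assumed field; AUTO would pick a built backend

  void *compressed = nullptr;
  size_t compressed_size = 0;
  // (D, dtype, shape, tol, s, error mode, input, output, size, config, preallocated)
  mgard_x::compress(3, mgard_x::data_type::Float, shape, 1e-3, 0.0,
                    mgard_x::error_bound_type::REL, original.data(),
                    compressed, compressed_size, config, false);

  std::free(compressed); // assumes the library heap-allocates the output when not preallocated
  return 0;
}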
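Note on the SYCL device adapter: deviceAvailable<SYCL>() above reduces to DeviceRuntime<SYCL>::GetDeviceCount() > 0. For illustration only (the adapter's internals are outside this diff), such a count can be obtained with standard SYCL 2020 device enumeration:

#include <sycl/sycl.hpp>

// Count GPU devices visible to the SYCL runtime across all platforms;
// the backend counts as available when this is non-zero.
int sycl_gpu_device_count() {
  auto gpus = sycl::device::get_devices(sycl::info::device_type::gpu);
  return static_cast<int>(gpus.size());
}

The adapter also forces SyncAllKernelsAndCheckErrors to true, which suggests every kernel submission is followed by a queue wait so asynchronous SYCL errors surface at a predictable point; that is an inference from the comment in the file, not code shown in this diff.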