diff --git a/CMakeLists.txt b/CMakeLists.txt index d6ea3fee9..ff5d239d5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,4 +1,4 @@ -cmake_minimum_required (VERSION 3.19) +cmake_minimum_required (VERSION 3.21) project (asgard VERSION 0.3.0 @@ -12,7 +12,7 @@ include (FetchContent) find_package (Git) # Define a macro to register new projects. -function (register_project name dir url default_tag) +function (register_project name dir url default_tag make_avail) set (BUILD_TAG_${dir} ${default_tag} CACHE STRING "Name of the tag to checkout.") set (BUILD_REPO_${dir} ${url} CACHE STRING "URL of the repo to clone.") @@ -24,7 +24,9 @@ function (register_project name dir url default_tag) SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/contrib/${dir} ) - FetchContent_MakeAvailable(${name}) + if (${make_avail}) + FetchContent_MakeAvailable(${name}) + endif() endfunction () # Changes to the current version of kromult should proceed through a pull @@ -33,6 +35,7 @@ register_project (kronmult KRONMULT https://github.com/project-asgard/kronmult.git f941819685bbd3026a85145dde286f593683c1f4 + OFF ) ############################################################################### @@ -78,7 +81,7 @@ option (ASGARD_PROFILE_PERF "enable profiling support for using linux perf" "") option (ASGARD_PROFILE_VALGRIND "enable profiling support for using valgrind" "") option (ASGARD_GRAPHVIZ_PATH "optional location of bin/ containing dot executable" "") option (ASGARD_IO_HIGHFIVE "Use the HighFive HDF5 header library for I/O" OFF) -option (ASGARD_USE_CUDA "Optional CUDA support for asgard" OFF) +option (ASGARD_USE_HIP "Optional HIP support for asgard" OFF) option (ASGARD_USE_OPENMP "Optional openMP support for asgard" ON) option (ASGARD_USE_MPI "Optional distributed computing support for asgard" OFF) include(CMakeDependentOption) @@ -136,30 +139,173 @@ if(ASGARD_USE_OPENMP) endif() endif() -if(ASGARD_USE_CUDA) - find_package(CUDA 9.0 REQUIRED) # eventually want to remove this - how to set min version with enable_language? - include_directories(${CUDA_INCLUDE_DIRS}) - enable_language(CUDA) - set (CMAKE_CUDA_STANDARD 14) - set (CMAKE_CUDA_STANDARD_REQUIRED ON) -endif() +# convenience flags for which HIP platform has been setup +set(ASGARD_PLATFORM_NVCC 0) +set(ASGARD_PLATFORM_AMD 0) +if(ASGARD_USE_HIP) + # search for HIP and libraries + if(NOT DEFINED HIP_PATH) + if(NOT DEFINED ENV{HIP_PATH}) + set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed") + else() + set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed") + endif() + endif() -# build component to interface with Ed's kronmult lib -##TODO: link to kronmult as interface library -add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) -if(ASGARD_USE_CUDA) - set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension - set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) - set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3") - set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") + # set HIP_CLANG_PATH for potential installs in non-standard locations (such as rocm with spack) + if (NOT DEFINED HIP_CLANG_PATH) + if(NOT DEFINED ENV{HIP_CLANG_PATH}) + set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to HIP clang binaries") + else() + set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to HIP clang binaries") + endif() + endif() + + # note: could probably grab this path directly using hipconfig? + if (NOT DEFINED HIP_CLANG_INCLUDE_PATH) + if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH}) + # probably need a better way to get the compiler version.. this will cause non-existent paths for non-clang compilers + set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/${CMAKE_CXX_COMPILER_VERSION}/include" CACHE PATH "Path to HIP clang include directory") + else() + set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory") + endif() + endif() + + if(NOT DEFINED HIPBLAS_PATH) + if(NOT DEFINED ENV{HIPBLAS_PATH}) + set(HIPBLAS_PATH "${HIP_PATH}/../hipblas" CACHE PATH "Path to which HIPBLAS has been installed") + else() + set(HIPBLAS_PATH $ENV{HIPBLAS_PATH} CACHE PATH "Path to which HIPBLAS has been installed") + endif() + endif() + + # try to find hipconfig executable which can help detect platforms and include dirs + find_program(ASGARD_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin") + if(ASGARD_HIPCONFIG_PATH) + execute_process(COMMAND ${ASGARD_HIPCONFIG_PATH} --platform OUTPUT_VARIABLE ASGARD_HIP_PLATFORM) + elseif(DEFINED ENV{HIP_PLATFORM}) + set(ASGARD_HIP_PLATFORM "$ENV{HIP_PLATFORM}") + else() + message(FATAL_ERROR "Could not determine HIP platform, make sure HIP_PLATFORM is set") + endif() + + message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}") + # hip >= 4.2 is now using "amd" to identify platform + if(ASGARD_HIP_PLATFORM STREQUAL "hcc" OR ASGARD_HIP_PLATFORM STREQUAL "amd") + set(ASGARD_PLATFORM_AMD 1) + # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia" + elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc" OR ASGARD_HIP_PLATFORM STREQUAL "nvidia") + set(ASGARD_PLATFORM_NVCC 1) + endif() + + # double check for cuda path since HIP uses it internally + if(ASGARD_PLATFORM_NVCC) + if (NOT DEFINED ENV{CUDA_PATH}) + find_path(ASGARD_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include) + if (NOT ASGARD_HIP_DEFAULT_CUDA_PATH) + message(FATAL_ERROR "Make sure the CUDA_PATH env is set to locate for HIP") + endif() + endif() + message(STATUS "Found CUDA_PATH: $ENV{CUDA_PATH}") + endif() + + # look for HIP cmake configs in different locations + list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}") + if(ASGARD_PLATFORM_AMD) + # note: causes issues on nvidia, but might be needed on amd platforms? + list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}") + + # output a warning if compiling for AMD without using amd-clang + if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang") + message(WARNING "Compiling HIP for AMD without using AMD clang might not work. Use -DCMAKE_CXX_COMPILER=clang++") + endif() + endif() + list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake") + + set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP") + + set(ASGARD_HIP_FLAGS "-std=c++14;-g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA") + set(GPU_ARCH "70" CACHE STRING "AMD/NVIDIA GPU architecture number (such as 906 or 70)") + + if(ASGARD_PLATFORM_AMD) + set(AMDGPU_TARGETS "gfx${GPU_ARCH}" CACHE STRING "GPU target architectures to compile for" FORCE) + set(GPU_TARGETS "gfx${GPU_ARCH}" CACHE STRING "GPU target architectures to compile for" FORCE) + + # need a much later version for AMD since ipiv=nullptr fix not in hipblas until >4.3.1 + find_package(HIP 4.3.0 REQUIRED) + find_package(hipblas 0.49 REQUIRED) + else() + find_package(HIP 4.0 REQUIRED) + find_package(hipblas REQUIRED) + endif() + # Print some debug info about HIP configuration + message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}") + message(STATUS "HIP COMPILER: ${HIP_COMPILER}") + message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}") + message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}") + message(STATUS "HIP Libraries: ${HIP_LIBRARIES}") + + if(ASGARD_PLATFORM_NVCC) + enable_language(CUDA) + set (CMAKE_CUDA_STANDARD 14) + set (CMAKE_CUDA_STANDARD_REQUIRED ON) + add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__) + set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${GPU_ARCH},code=compute_${GPU_ARCH} --ptxas-options=-O3 -lineinfo" CACHE STRING "Flags to pass to NVCC" FORCE) # nvcc specific options + elseif(ASGARD_PLATFORM_AMD) + #enable_language(HIP) # not yet added to latest cmake, but should be available in 3.21 + # these compile definitions should be added automatically if using amd's clang, but + # may not necessarily be added if compiling with gcc or others + add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__) + set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${GPU_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP" FORCE) # amdgpu specific options + endif() + + if (hipBLAS_FOUND) + message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}") + endif() + + # set source file language properties + if(ASGARD_PLATFORM_AMD) + #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release? + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1) + elseif(ASGARD_PLATFORM_NVCC) + set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension + endif() # Turn on GPU support in kronmult. set (USE_GPU ON CACHE BOOL "Turn on kronmult gpu support" FORCE) endif() +# Fetch kronmult after configuring everything, but before adding libraries +FetchContent_MakeAvailable(kronmult) + +if(ASGARD_USE_HIP) + set(CMAKE_HIP_ARCHITECTURES OFF) + set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "") + set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "") + set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "") + + hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + target_include_directories(kronmult_cuda PUBLIC ${HIP_INCLUDE_DIRS}) + # assume this include path since HIP_INCLUDE_DIRS is not being set on nvidia platform + target_include_directories(kronmult_cuda PUBLIC "${HIP_PATH}/include") + target_include_directories(kronmult_cuda PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + if(ASGARD_PLATFORM_NVCC) + set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF) + set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${GPU_ARCH} -g -lineinfo --ptxas-options=-O3") + endif() + set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}") +else() + # build component to interface with Ed's kronmult lib + ##TODO: link to kronmult as interface library + add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp) +endif() + if(ASGARD_USE_MKL) - if(ASGARD_USE_CUDA) - set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp") + if(ASGARD_USE_HIP AND ASGARD_PLATFORM_NVCC) + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp") else() target_compile_options (kronmult_cuda PRIVATE "-fopenmp") # CMAKE doesn't handle MKL openmp link properly if(APPLE) # Need to link against the same openmp library as the MKL. @@ -228,7 +374,22 @@ if (ASGARD_USE_MATLAB) endif () foreach (component IN LISTS components) - add_library (${component} src/${component}.cpp) + if (ASGARD_USE_HIP) + hip_add_library (${component} src/${component}.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + set_target_properties(${component} PROPERTIES LINKER_LANGUAGE HIP) + target_include_directories(${component} SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(${component} SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(${component} SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + if(ASGARD_PLATFORM_NVCC) + target_include_directories(${component} SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + endif() + target_link_directories(${component} PRIVATE ${HIPBLAS_PATH}/lib/) # roc::hipblas target isn't linking properly on nvidia platforms + else() + add_library (${component} src/${component}.cpp) + endif() target_include_directories (${component} PRIVATE ${CMAKE_BINARY_DIR}) if(ASGARD_USE_MKL) target_compile_options (${component} PRIVATE "-fopenmp") # CMAKE doesn't handle MKL openmp link properly @@ -239,9 +400,7 @@ if (build_hdf5) add_dependencies (io hdf5-ext) endif () -if (build_kron) - add_dependencies (kronmult_cuda kronmult-ext) -endif () +add_dependencies (kronmult_cuda kron) if (ASGARD_USE_SCALAPACK) target_link_libraries (tensors PRIVATE scalapack_matrix_info cblacs_grid) @@ -281,7 +440,11 @@ if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) target_link_libraries(kronmult PRIVATE OpenMP::OpenMP_CXX) endif () -target_link_libraries(kronmult_cuda PUBLIC kron) +if (ASGARD_USE_HIP AND ASGARD_PLATFORM_AMD) + target_link_libraries(kronmult_cuda PUBLIC kron hip::device) +else () + target_link_libraries(kronmult_cuda PRIVATE kron) +endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) target_link_libraries(kronmult_cuda PRIVATE OpenMP::OpenMP_CXX) @@ -302,9 +465,11 @@ else () target_link_libraries (lib_dispatch PRIVATE LINALG::LINALG) endif () -if (ASGARD_USE_CUDA) - target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES} - ${CUDA_CUBLAS_LIBRARIES}) +if (ASGARD_USE_HIP) + if(ASGARD_PLATFORM_AMD) + target_link_libraries(lib_dispatch PRIVATE hip::device) + target_link_libraries(lib_dispatch PRIVATE roc::hipblas) + endif() endif() if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL) @@ -330,8 +495,11 @@ target_link_libraries (quadrature PRIVATE matlab_utilities tensors) target_link_libraries (solver PRIVATE distribution fast_math lib_dispatch tensors) target_link_libraries (tensors PRIVATE lib_dispatch) -if (ASGARD_USE_CUDA) - target_link_libraries (tensors PRIVATE ${CUDA_LIBRARIES}) +if (ASGARD_USE_HIP) + if(ASGARD_PLATFORM_AMD) + target_link_libraries(tensors PRIVATE hip::device) + target_link_libraries (tensors PRIVATE roc::hipblas) + endif() endif () if (ASGARD_USE_SCALAPACK) add_compile_definitions (ASGARD_USE_SCALAPACK) @@ -345,7 +513,16 @@ target_link_libraries (transformations quadrature tensors) # define the main application and its linking -add_executable (asgard src/main.cpp) +if (ASGARD_USE_HIP) + hip_add_executable (asgard src/main.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + set_target_properties(asgard PROPERTIES LINKER_LANGUAGE HIP) + target_link_directories(asgard PRIVATE ${HIPBLAS_PATH}/lib/) +else() + add_executable (asgard src/main.cpp) +endif() # link in components needed directly by main set (main_app_link_deps @@ -400,7 +577,21 @@ if (ASGARD_BUILD_TESTS) enable_testing () # Define ctest tests and their executables - add_library (tests_general testing/tests_general.cpp) + if (ASGARD_USE_HIP) + hip_add_library (tests_general testing/tests_general.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + target_include_directories(tests_general SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(tests_general SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(tests_general SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + if (ASGARD_PLATFORM_NVCC) + target_include_directories(tests_general SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(tests_general PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY}) + endif() + else() + add_library (tests_general testing/tests_general.cpp) + endif() target_link_libraries (tests_general PUBLIC Catch PRIVATE pde program_options ) target_include_directories(tests_general PRIVATE ${CMAKE_BINARY_DIR}) @@ -416,7 +607,22 @@ if (ASGARD_BUILD_TESTS) endif() foreach (component IN LISTS components) - add_executable (${component}-tests src/${component}_tests.cpp) + if (ASGARD_USE_HIP) + hip_add_executable (${component}-tests src/${component}_tests.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + if (ASGARD_PLATFORM_NVCC) + target_include_directories(${component}-tests SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(${component}-tests PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY}) + target_link_directories(${component}-tests PRIVATE ${HIPBLAS_PATH}/lib/) + endif() + target_include_directories(${component}-tests SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(${component}-tests SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(${component}-tests SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + else() + add_executable (${component}-tests src/${component}_tests.cpp) + endif() target_include_directories (${component}-tests PRIVATE ${CMAKE_SOURCE_DIR}/testing) target_include_directories (${component}-tests PRIVATE ${CMAKE_BINARY_DIR}) target_link_libraries (${component}-tests PRIVATE ${component} tests_general) @@ -432,7 +638,7 @@ if (ASGARD_BUILD_TESTS) target_link_libraries (${component}-tests PRIVATE ${component} MPI::MPI_CXX) if (${component} IN_LIST mpi_test_components) set(test_ranks "4") - if (ASGARD_USE_CUDA) + if (ASGARD_USE_HIP) set(test_ranks "1") endif () if (${ASGARD_TESTING_RANKS}) @@ -450,7 +656,22 @@ if (ASGARD_BUILD_TESTS) WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} ) endforeach () - add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp) + if (ASGARD_USE_HIP) + hip_add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp + HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}" + NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas" + CLANG_OPTIONS "${ASGARD_AMD_FLAGS}") + if (ASGARD_PLATFORM_NVCC) + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}) + target_link_libraries(kronmult_cuda-tests PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY}) + target_link_directories(kronmult_cuda-tests PRIVATE ${HIPBLAS_PATH}/lib/) + endif() + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${HIP_INCLUDE_DIRS}) + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC "${HIP_PATH}/include") + target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS}) + else() + add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp) + endif() target_include_directories (kronmult_cuda-tests PRIVATE ${CMAKE_SOURCE_DIR}/testing) target_include_directories (kronmult_cuda-tests PRIVATE ${CMAKE_BINARY_DIR}) target_link_libraries (kronmult_cuda-tests PRIVATE coefficients kronmult_cuda tests_general) diff --git a/contrib/FindLINALG.cmake b/contrib/FindLINALG.cmake index b92d94607..7f1f06af4 100644 --- a/contrib/FindLINALG.cmake +++ b/contrib/FindLINALG.cmake @@ -54,6 +54,7 @@ if (${ASGARD_BUILD_OPENBLAS}) OPENBLAS https://github.com/xianyi/OpenBLAS.git v0.3.18 + ON ) # Fetch content does not run the install phase so the headers for openblas are diff --git a/src/batch_tests.cpp b/src/batch_tests.cpp index 757b3d4e8..4739b0f79 100644 --- a/src/batch_tests.cpp +++ b/src/batch_tests.cpp @@ -864,7 +864,11 @@ void test_batched_gemv(int const m, int const n, int const lda, batched_gemv(a_batch, x_batch, y_batch, alpha, beta); - P const tol_factor = 1e-17; + P tol_factor = 1e-17; + if constexpr (resrc == resource::device) + { + tol_factor = 1e-7; + } for (int i = 0; i < num_batch; ++i) { if constexpr (resrc == resource::host) diff --git a/src/build_info.hpp.in b/src/build_info.hpp.in index 089a874cd..019abe0b7 100644 --- a/src/build_info.hpp.in +++ b/src/build_info.hpp.in @@ -6,7 +6,7 @@ #define BUILD_TIME "@BUILD_TIME@" #cmakedefine ASGARD_IO_HIGHFIVE -#cmakedefine ASGARD_USE_CUDA +#cmakedefine ASGARD_USE_HIP #cmakedefine ASGARD_USE_OPENMP #cmakedefine ASGARD_USE_MPI #cmakedefine ASGARD_USE_MATLAB diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp index 9cc8a9d85..fe6573a2c 100644 --- a/src/device/kronmult_cuda.cpp +++ b/src/device/kronmult_cuda.cpp @@ -1,9 +1,8 @@ #include "kronmult_cuda.hpp" #include "build_info.hpp" -#ifdef ASGARD_USE_CUDA -#include -#include +#ifdef ASGARD_USE_HIP +#include #define USE_GPU #define GLOBAL_FUNCTION __global__ #define SYNCTHREADS __syncthreads() @@ -47,7 +46,7 @@ GLOBAL_FUNCTION void stage_inputs_kronmult_kernel(P const *const x, P *const workspace, int const num_elems, int const num_copies) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP expect(blockIdx.y == 0); expect(blockIdx.z == 0); @@ -90,7 +89,7 @@ void stage_inputs_kronmult(P const *const x, P *const workspace, expect(num_elems > 0); expect(num_copies > 0); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto constexpr warp_size = 32; auto constexpr num_warps = 8; @@ -99,11 +98,12 @@ void stage_inputs_kronmult(P const *const x, P *const workspace, auto const total_copies = static_cast(num_elems) * num_copies; auto const num_blocks = (total_copies + num_threads - 1) / num_threads; - stage_inputs_kronmult_kernel

- <<>>(x, workspace, num_elems, num_copies); + hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel

), + dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace, + num_elems, num_copies); - auto const stat = cudaDeviceSynchronize(); - expect(stat == cudaSuccess); + auto const stat = hipDeviceSynchronize(); + expect(stat == hipSuccess); #else stage_inputs_kronmult_kernel(x, workspace, num_elems, num_copies); #endif @@ -175,7 +175,7 @@ prepare_kronmult_kernel(int const *const flattened_table, auto const coord_size = num_dims * 2; auto const num_elems = static_cast(num_cols) * num_rows; -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP expect(blockIdx.y == 0); expect(blockIdx.z == 0); @@ -192,7 +192,7 @@ prepare_kronmult_kernel(int const *const flattened_table, auto const increment = 1; #endif -#ifndef ASGARD_USE_CUDA +#ifndef ASGARD_USE_HIP #ifdef ASGARD_USE_OPENMP #pragma omp parallel for #endif @@ -273,7 +273,7 @@ void prepare_kronmult(int const *const flattened_table, expect(input_ptrs); expect(output_ptrs); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto constexpr warp_size = 32; auto constexpr num_warps = 8; auto constexpr num_threads = num_warps * warp_size; @@ -281,12 +281,14 @@ void prepare_kronmult(int const *const flattened_table, static_cast(elem_col_stop - elem_col_start + 1) * (elem_row_stop - elem_row_start + 1); auto const num_blocks = (num_krons / num_threads) + 1; - prepare_kronmult_kernel

<<>>( - flattened_table, operators, operator_lda, element_x, element_work, fx, - operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, num_terms, - num_dims, elem_row_start, elem_row_stop, elem_col_start, elem_col_stop); - auto const stat = cudaDeviceSynchronize(); - expect(stat == cudaSuccess); + hipLaunchKernelGGL(HIP_KERNEL_NAME(prepare_kronmult_kernel

), + dim3(num_blocks), dim3(num_threads), 0, 0, flattened_table, + operators, operator_lda, element_x, element_work, fx, + operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, + num_terms, num_dims, elem_row_start, elem_row_stop, + elem_col_start, elem_col_stop); + auto const stat = hipDeviceSynchronize(); + expect(stat == hipSuccess); #else prepare_kronmult_kernel( flattened_table, operators, operator_lda, element_x, element_work, fx, @@ -304,7 +306,7 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], P const *const operator_ptrs[], int const lda, int const num_krons, int const num_dims) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP { int constexpr warpsize = 32; int constexpr nwarps = 1; @@ -313,28 +315,40 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], switch (num_dims) { case 1: - kronmult1_xbatched

<<>>( - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched

), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 2: - kronmult2_xbatched

<<>>( - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched

), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 3: - kronmult3_xbatched

<<>>( - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched

), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 4: - kronmult4_xbatched

<<>>( - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched

), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 5: - kronmult5_xbatched

<<>>( - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched

), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; case 6: - kronmult6_xbatched

<<>>( - n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons); + hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched

), + dim3(num_krons), dim3(num_threads), 0, 0, n, + operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, + num_krons); break; default: expect(false); @@ -343,8 +357,8 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[], // ------------------------------------------- // note important to wait for kernel to finish // ------------------------------------------- - auto const stat = cudaDeviceSynchronize(); - expect(stat == cudaSuccess); + auto const stat = hipDeviceSynchronize(); + expect(stat == hipSuccess); } #else diff --git a/src/kronmult.cpp b/src/kronmult.cpp index 44d8f0098..a040c28f4 100644 --- a/src/kronmult.cpp +++ b/src/kronmult.cpp @@ -3,8 +3,8 @@ #include "lib_dispatch.hpp" #include "tools.hpp" -#ifdef ASGARD_USE_CUDA -#include +#ifdef ASGARD_USE_HIP +#include #endif #ifdef ASGARD_USE_OPENMP diff --git a/src/kronmult.hpp b/src/kronmult.hpp index 47e2f46f3..ad6cdbbf7 100644 --- a/src/kronmult.hpp +++ b/src/kronmult.hpp @@ -1,5 +1,5 @@ #pragma once -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP #define USE_GPU #endif #include "distribution.hpp" diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp index 15277a3b0..b94ee451a 100644 --- a/src/lib_dispatch.cpp +++ b/src/lib_dispatch.cpp @@ -60,9 +60,9 @@ extern "C" #pragma GCC diagnostic pop #endif -#ifdef ASGARD_USE_CUDA -#include -#include +#ifdef ASGARD_USE_HIP +#include +#include #endif #ifdef ASGARD_USE_SCALAPACK @@ -93,61 +93,61 @@ struct device_handler { device_handler() { -#ifdef ASGARD_USE_CUDA - auto success = cublasCreate(&handle); - expect(success == CUBLAS_STATUS_SUCCESS); +#ifdef ASGARD_USE_HIP + auto success = hipblasCreate(&handle); + expect(success == HIPBLAS_STATUS_SUCCESS); - success = cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST); - expect(success == CUBLAS_STATUS_SUCCESS); + success = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST); + expect(success == HIPBLAS_STATUS_SUCCESS); #endif } void set_device(int const local_rank) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP int num_devices; - auto success = cudaGetDeviceCount(&num_devices); + auto success = hipGetDeviceCount(&num_devices); - expect(success == cudaSuccess); + expect(success == hipSuccess); expect(local_rank >= 0); expect(local_rank < num_devices); if (handle) { - auto const cublas_success = cublasDestroy(handle); - expect(cublas_success == CUBLAS_STATUS_SUCCESS); + auto const hipblas_success = hipblasDestroy(handle); + expect(hipblas_success == HIPBLAS_STATUS_SUCCESS); } - success = cudaSetDevice(local_rank); - expect(success == cudaSuccess); - auto const cublas_success = cublasCreate(&handle); - expect(cublas_success == CUBLAS_STATUS_SUCCESS); + success = hipSetDevice(local_rank); + expect(success == hipSuccess); + auto const hipblas_success = hipblasCreate(&handle); + expect(hipblas_success == HIPBLAS_STATUS_SUCCESS); #else ignore(local_rank); #endif } -#ifdef ASGARD_USE_CUDA - cublasHandle_t const &get_handle() const { return handle; } +#ifdef ASGARD_USE_HIP + hipblasHandle_t const &get_handle() const { return handle; } #endif ~device_handler() { -#ifdef ASGARD_USE_CUDA - cublasDestroy(handle); +#ifdef ASGARD_USE_HIP + hipblasDestroy(handle); #endif } private: -#ifdef ASGARD_USE_CUDA - cublasHandle_t handle; +#ifdef ASGARD_USE_HIP + hipblasHandle_t handle; #endif }; static device_handler device; void initialize_libraries(int const local_rank) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP expect(local_rank >= 0); device.set_device(local_rank); #else @@ -155,16 +155,16 @@ void initialize_libraries(int const local_rank) #endif } -#ifdef ASGARD_USE_CUDA -inline cublasOperation_t cublas_trans(char trans) +#ifdef ASGARD_USE_HIP +inline hipblasOperation_t hipblas_trans(char trans) { if (trans == 'N' || trans == 'n') { - return CUBLAS_OP_N; + return HIPBLAS_OP_N; } else { - return CUBLAS_OP_T; + return HIPBLAS_OP_T; } } #endif @@ -182,16 +182,16 @@ void rotg(P *a, P *b, P *c, P *s, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // function instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDrotg(device.get_handle(), a, b, c, s); + auto const success = hipblasDrotg(device.get_handle(), a, b, c, s); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSrotg(device.get_handle(), a, b, c, s); + auto const success = hipblasSrotg(device.get_handle(), a, b, c, s); expect(success == 0); } return; @@ -219,7 +219,7 @@ P nrm2(int *n, P *x, int *incx, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); P norm; @@ -227,13 +227,13 @@ P nrm2(int *n, P *x, int *incx, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDnrm2(device.get_handle(), *n, x, *incx, &norm); + hipblasDnrm2(device.get_handle(), *n, x, *incx, &norm); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSnrm2(device.get_handle(), *n, x, *incx, &norm); + hipblasSnrm2(device.get_handle(), *n, x, *incx, &norm); expect(success == 0); } return norm; @@ -272,7 +272,7 @@ void copy(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); @@ -280,13 +280,13 @@ void copy(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDcopy(device.get_handle(), *n, x, *incx, y, *incy); + hipblasDcopy(device.get_handle(), *n, x, *incx, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasScopy(device.get_handle(), *n, x, *incx, y, *incy); + hipblasScopy(device.get_handle(), *n, x, *incx, y, *incy); expect(success == 0); } return; @@ -323,7 +323,7 @@ P dot(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); @@ -332,13 +332,13 @@ P dot(int *n, P *x, int *incx, P *y, int *incy, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDdot(device.get_handle(), *n, x, *incx, y, *incy, &result); + hipblasDdot(device.get_handle(), *n, x, *incx, y, *incy, &result); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSdot(device.get_handle(), *n, x, *incx, y, *incy, &result); + hipblasSdot(device.get_handle(), *n, x, *incx, y, *incy, &result); expect(success == 0); } return result; @@ -379,7 +379,7 @@ void axpy(int *n, P *alpha, P *x, int *incx, P *y, int *incy, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); @@ -387,13 +387,13 @@ void axpy(int *n, P *alpha, P *x, int *incx, P *y, int *incy, if constexpr (std::is_same::value) { auto const success = - cublasDaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); + hipblasDaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); + hipblasSaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy); expect(success == 0); } return; @@ -429,7 +429,7 @@ void scal(int *n, P *alpha, P *x, int *incx, resource const resrc) if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); @@ -437,13 +437,13 @@ void scal(int *n, P *alpha, P *x, int *incx, resource const resrc) if constexpr (std::is_same::value) { auto const success = - cublasDscal(device.get_handle(), *n, alpha, x, *incx); + hipblasDscal(device.get_handle(), *n, alpha, x, *incx); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSscal(device.get_handle(), *n, alpha, x, *incx); + hipblasSscal(device.get_handle(), *n, alpha, x, *incx); expect(success == 0); } return; @@ -561,7 +561,7 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); @@ -569,15 +569,15 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x, if constexpr (std::is_same::value) { auto const success = - cublasDgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, - A, *lda, x, *incx, beta, y, *incy); + hipblasDgemv(device.get_handle(), hipblas_trans(*trans), *m, *n, + alpha, A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } else if constexpr (std::is_same::value) { auto const success = - cublasSgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha, - A, *lda, x, *incx, beta, y, *incy); + hipblasSgemv(device.get_handle(), hipblas_trans(*trans), *m, *n, + alpha, A, *lda, x, *incx, beta, y, *incy); expect(success == 0); } return; @@ -627,23 +627,23 @@ void gemm(char const *transa, char const *transb, int *m, int *n, int *k, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgemm( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); + auto const success = hipblasDgemm( + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgemm( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); + auto const success = hipblasSgemm( + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc); expect(success == 0); } return; @@ -686,7 +686,7 @@ void getrf(int *m, int *n, P *A, int *lda, int *ipiv, int *info, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); @@ -694,22 +694,22 @@ void getrf(int *m, int *n, P *A, int *lda, int *ipiv, int *info, ignore(m); P **A_d; - auto stat = cudaMalloc((void **)&A_d, sizeof(P *)); + auto stat = hipMalloc((void **)&A_d, sizeof(P *)); expect(stat == 0); - stat = cudaMemcpy(A_d, &A, sizeof(P *), cudaMemcpyHostToDevice); + stat = hipMemcpy(A_d, &A, sizeof(P *), hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgetrfBatched(device.get_handle(), *n, A_d, - *lda, ipiv, info, 1); + auto const success = hipblasDgetrfBatched(device.get_handle(), *n, A_d, + *lda, ipiv, info, 1); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgetrfBatched(device.get_handle(), *n, A_d, - *lda, ipiv, info, 1); + auto const success = hipblasSgetrfBatched(device.get_handle(), *n, A_d, + *lda, ipiv, info, 1); expect(success == 0); } return; @@ -747,7 +747,7 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); @@ -755,28 +755,28 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info, expect(*lwork == (*n) * (*n)); ignore(lwork); - P const **A_d; + P **A_d; // hipBlas loses const to DgetriBatched P **work_d; - auto stat = cudaMalloc((void **)&A_d, sizeof(P *)); + auto stat = hipMalloc((void **)&A_d, sizeof(P *)); expect(stat == 0); - stat = cudaMalloc((void **)&work_d, sizeof(P *)); + stat = hipMalloc((void **)&work_d, sizeof(P *)); expect(stat == 0); - stat = cudaMemcpy(A_d, &A, sizeof(P *), cudaMemcpyHostToDevice); + stat = hipMemcpy(A_d, &A, sizeof(P *), hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(work_d, &work, sizeof(P *), cudaMemcpyHostToDevice); + stat = hipMemcpy(work_d, &work, sizeof(P *), hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgetriBatched( + auto const success = hipblasDgetriBatched( device.get_handle(), *n, A_d, *lda, nullptr, work_d, *n, info, 1); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgetriBatched( + auto const success = hipblasSgetriBatched( device.get_handle(), *n, A_d, *lda, nullptr, work_d, *n, info, 1); expect(success == 0); } @@ -824,7 +824,7 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); @@ -833,44 +833,44 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b, P **c_d; size_t const list_size = *num_batch * sizeof(P *); - auto stat = cudaMalloc((void **)&a_d, list_size); + auto stat = hipMalloc((void **)&a_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&b_d, list_size); + stat = hipMalloc((void **)&b_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&c_d, list_size); + stat = hipMalloc((void **)&c_d, list_size); expect(stat == 0); - stat = cudaMemcpy(a_d, a, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(a_d, a, list_size, hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(b_d, b, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(b_d, b, list_size, hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(c_d, c, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(c_d, c, list_size, hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgemmBatched( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); - expect(cuda_stat == 0); + auto const success = hipblasDgemmBatched( + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgemmBatched( - device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m, - *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); - expect(cuda_stat == 0); + auto const success = hipblasSgemmBatched( + device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb), + *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } - stat = cudaFree(a_d); + stat = hipFree(a_d); expect(stat == 0); - stat = cudaFree(b_d); + stat = hipFree(b_d); expect(stat == 0); - stat = cudaFree(c_d); + stat = hipFree(c_d); expect(stat == 0); return; @@ -912,7 +912,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, if (resrc == resource::device) { // device-specific specialization if needed -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP // no non-fp blas on device expect(std::is_floating_point_v

); char const transb = 'n'; @@ -929,46 +929,46 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x, P **y_d; size_t const list_size = *num_batch * sizeof(P *); - auto stat = cudaMalloc((void **)&a_d, list_size); + auto stat = hipMalloc((void **)&a_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&x_d, list_size); + stat = hipMalloc((void **)&x_d, list_size); expect(stat == 0); - stat = cudaMalloc((void **)&y_d, list_size); + stat = hipMalloc((void **)&y_d, list_size); expect(stat == 0); - stat = cudaMemcpy(a_d, a, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(a_d, a, list_size, hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(x_d, x, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(x_d, x, list_size, hipMemcpyHostToDevice); expect(stat == 0); - stat = cudaMemcpy(y_d, y, list_size, cudaMemcpyHostToDevice); + stat = hipMemcpy(y_d, y, list_size, hipMemcpyHostToDevice); expect(stat == 0); // instantiated for these two fp types if constexpr (std::is_same::value) { - auto const success = cublasDgemmBatched( - device.get_handle(), cublas_trans(*trans), cublas_trans(transb), + auto const success = hipblasDgemmBatched( + device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); - expect(cuda_stat == 0); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } else if constexpr (std::is_same::value) { - auto const success = cublasSgemmBatched( - device.get_handle(), cublas_trans(*trans), cublas_trans(transb), + auto const success = hipblasSgemmBatched( + device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb), gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc, *num_batch); - auto const cuda_stat = cudaDeviceSynchronize(); - expect(cuda_stat == 0); + auto const hip_stat = hipDeviceSynchronize(); + expect(hip_stat == 0); expect(success == 0); } - stat = cudaFree(a_d); + stat = hipFree(a_d); expect(stat == 0); - stat = cudaFree(x_d); + stat = hipFree(x_d); expect(stat == 0); - stat = cudaFree(y_d); + stat = hipFree(y_d); expect(stat == 0); return; diff --git a/src/lib_dispatch_tests.cpp b/src/lib_dispatch_tests.cpp index 87ea0c9ea..8c0eb3ca9 100644 --- a/src/lib_dispatch_tests.cpp +++ b/src/lib_dispatch_tests.cpp @@ -793,7 +793,7 @@ TEMPLATE_TEST_CASE("dot product (lib_dispatch::dot)", "[lib_dispatch]", float, TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)", "[lib_dispatch]", float, double) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP fk::matrix const test{{0.767135868133925, -0.641484652834663}, {0.641484652834663, 0.767135868133926}}; @@ -810,7 +810,7 @@ TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)", lib_dispatch::getrf(&m, &n, test_d.data(), &lda, ipiv_d.data(), info_d.data(), resource::device); - auto stat = cudaDeviceSynchronize(); + auto stat = hipDeviceSynchronize(); REQUIRE(stat == 0); fk::vector const info_check(info_d.clone_onto_host()); REQUIRE(info_check(0) == 0); @@ -822,7 +822,7 @@ TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)", lib_dispatch::getri(&n, test_d.data(), &lda, ipiv_d.data(), work.data(), &size, info_d.data(), resource::device); - stat = cudaDeviceSynchronize(); + stat = hipDeviceSynchronize(); REQUIRE(stat == 0); fk::vector const info_check_2(info_d.clone_onto_host()); REQUIRE(info_check_2(0) == 0); @@ -1179,7 +1179,11 @@ TEMPLATE_TEST_CASE_SIG("batched gemv", "[lib_dispatch]", (double, resource::host), (double, resource::device), (float, resource::host), (float, resource::device)) { - TestType const tol_factor = 1e-18; + TestType tol_factor = 1e-18; + if constexpr (resrc == resource::device) + { + tol_factor = 1e-7; + } SECTION("batched gemv: no trans, alpha = 1.0, beta = 0.0") { diff --git a/src/program_options.cpp b/src/program_options.cpp index 548c17f3d..6663686ef 100644 --- a/src/program_options.cpp +++ b/src/program_options.cpp @@ -211,7 +211,7 @@ parser::parser(int argc, char **argv) } } -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP if (use_implicit_stepping) { std::cerr << "GPU acceleration not implemented for implicit stepping\n"; diff --git a/src/tensors.hpp b/src/tensors.hpp index 90eecd8c7..0fff81f60 100644 --- a/src/tensors.hpp +++ b/src/tensors.hpp @@ -1,8 +1,8 @@ #pragma once #include "build_info.hpp" -#ifdef ASGARD_USE_CUDA -#include +#ifdef ASGARD_USE_HIP +#include #endif #include "lib_dispatch.hpp" @@ -639,9 +639,9 @@ template inline void allocate_device(P *&ptr, int const num_elems, bool const initialize = true) { -#ifdef ASGARD_USE_CUDA - auto success = cudaMalloc((void **)&ptr, num_elems * sizeof(P)); - assert(success == cudaSuccess); +#ifdef ASGARD_USE_HIP + auto success = hipMalloc((void **)&ptr, num_elems * sizeof(P)); + assert(success == hipSuccess); if (num_elems > 0) { expect(ptr != nullptr); @@ -649,8 +649,8 @@ allocate_device(P *&ptr, int const num_elems, bool const initialize = true) if (initialize) { - success = cudaMemset((void *)ptr, 0, num_elems * sizeof(P)); - expect(success == cudaSuccess); + success = hipMemset((void *)ptr, 0, num_elems * sizeof(P)); + expect(success == hipSuccess); } #else @@ -668,12 +668,12 @@ allocate_device(P *&ptr, int const num_elems, bool const initialize = true) template inline void delete_device(P *const ptr) { -#ifdef ASGARD_USE_CUDA - auto const success = cudaFree(ptr); +#ifdef ASGARD_USE_HIP + auto const success = hipFree(ptr); // the device runtime may be unloaded at process shut down // (when static storage duration destructors are called) // returning a cudartUnloading error code. - expect((success == cudaSuccess) || (success == cudaErrorCudartUnloading)); + expect((success == hipSuccess) || (success == hipErrorDeinitialized)); #else delete[] ptr; #endif @@ -683,10 +683,10 @@ template inline void copy_on_device(P *const dest, P const *const source, int const num_elems) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyDeviceToDevice); - expect(success == cudaSuccess); + hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyDeviceToDevice); + expect(success == hipSuccess); #else std::copy(source, source + num_elems, dest); #endif @@ -696,10 +696,10 @@ template inline void copy_to_device(P *const dest, P const *const source, int const num_elems) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyHostToDevice); - expect(success == cudaSuccess); + hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyHostToDevice); + expect(success == hipSuccess); #else std::copy(source, source + num_elems, dest); #endif @@ -709,10 +709,10 @@ template inline void copy_to_host(P *const dest, P const *const source, int const num_elems) { -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP auto const success = - cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyDeviceToHost); - expect(success == cudaSuccess); + hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyDeviceToHost); + expect(success == hipSuccess); #else std::copy(source, source + num_elems, dest); #endif @@ -726,11 +726,14 @@ copy_matrix_on_device(fk::matrix &dest, expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP + // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0 + if (source.stride() == 0) + return; auto const success = - cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), - source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), cudaMemcpyDeviceToDevice); + hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), + source.stride() * sizeof(P), source.nrows() * sizeof(P), + source.ncols(), hipMemcpyDeviceToDevice); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -745,11 +748,14 @@ copy_matrix_to_device(fk::matrix &dest, { expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP + // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0 + if (source.stride() == 0) + return; auto const success = - cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), - source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), cudaMemcpyHostToDevice); + hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), + source.stride() * sizeof(P), source.nrows() * sizeof(P), + source.ncols(), hipMemcpyHostToDevice); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin()); @@ -764,11 +770,14 @@ copy_matrix_to_host(fk::matrix &dest, { expect(source.nrows() == dest.nrows()); expect(source.ncols() == dest.ncols()); -#ifdef ASGARD_USE_CUDA +#ifdef ASGARD_USE_HIP + // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0 + if (source.stride() == 0) + return; auto const success = - cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), - source.stride() * sizeof(P), source.nrows() * sizeof(P), - source.ncols(), cudaMemcpyDeviceToHost); + hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(), + source.stride() * sizeof(P), source.nrows() * sizeof(P), + source.ncols(), hipMemcpyDeviceToHost); expect(success == 0); #else std::copy(source.begin(), source.end(), dest.begin());