diff --git a/CMakeLists.txt b/CMakeLists.txt
index d6ea3fee9..ff5d239d5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,4 +1,4 @@
-cmake_minimum_required (VERSION 3.19) 
+cmake_minimum_required (VERSION 3.21)
 
 project (asgard
   VERSION 0.3.0
@@ -12,7 +12,7 @@ include (FetchContent)
 find_package (Git)
 
 #  Define a macro to register new projects.
-function (register_project name dir url default_tag)
+function (register_project name dir url default_tag make_avail)
     set (BUILD_TAG_${dir} ${default_tag} CACHE STRING "Name of the tag to checkout.")
     set (BUILD_REPO_${dir} ${url} CACHE STRING "URL of the repo to clone.")
 
@@ -24,7 +24,9 @@ function (register_project name dir url default_tag)
         SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}/contrib/${dir}
     )
 
-    FetchContent_MakeAvailable(${name})
+    if (${make_avail})
+      FetchContent_MakeAvailable(${name})
+    endif()
 endfunction ()
 
 #  Changes to the current version of kromult should proceed through a pull
@@ -33,6 +35,7 @@ register_project (kronmult
                   KRONMULT
                   https://github.com/project-asgard/kronmult.git
                   f941819685bbd3026a85145dde286f593683c1f4
+                  OFF
 )
 
 ###############################################################################
@@ -78,7 +81,7 @@ option (ASGARD_PROFILE_PERF "enable profiling support for using linux perf" "")
 option (ASGARD_PROFILE_VALGRIND "enable profiling support for using valgrind" "")
 option (ASGARD_GRAPHVIZ_PATH "optional location of bin/ containing dot executable" "")
 option (ASGARD_IO_HIGHFIVE "Use the HighFive HDF5 header library for I/O" OFF)
-option (ASGARD_USE_CUDA "Optional CUDA support for asgard" OFF)
+option (ASGARD_USE_HIP "Optional HIP support for asgard" OFF)
 option (ASGARD_USE_OPENMP "Optional openMP support for asgard" ON)
 option (ASGARD_USE_MPI "Optional distributed computing support for asgard" OFF)
 include(CMakeDependentOption)
@@ -136,30 +139,173 @@ if(ASGARD_USE_OPENMP)
   endif()
 endif()
 
-if(ASGARD_USE_CUDA)
-  find_package(CUDA 9.0 REQUIRED) # eventually want to remove this - how to set min version with enable_language?
-  include_directories(${CUDA_INCLUDE_DIRS})
-  enable_language(CUDA) 
-  set (CMAKE_CUDA_STANDARD 14)
-  set (CMAKE_CUDA_STANDARD_REQUIRED ON)
-endif()
+# convenience flags for which HIP platform has been setup
+set(ASGARD_PLATFORM_NVCC 0)
+set(ASGARD_PLATFORM_AMD 0)
+if(ASGARD_USE_HIP)
+  # search for HIP and libraries
+  if(NOT DEFINED HIP_PATH)
+    if(NOT DEFINED ENV{HIP_PATH})
+      set(HIP_PATH "/opt/rocm/hip" CACHE PATH "Path to which HIP has been installed")
+    else()
+      set(HIP_PATH $ENV{HIP_PATH} CACHE PATH "Path to which HIP has been installed")
+    endif()
+  endif()
 
-# build component to interface with Ed's kronmult lib
-##TODO: link to kronmult as interface library
-add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp)
-if(ASGARD_USE_CUDA)
-  set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension
-  set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF)
-  set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_70 -g -lineinfo --ptxas-options=-O3")
-  set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}")
+  # set HIP_CLANG_PATH for potential installs in non-standard locations (such as rocm with spack)
+  if (NOT DEFINED HIP_CLANG_PATH)
+    if(NOT DEFINED ENV{HIP_CLANG_PATH})
+      set(HIP_CLANG_PATH "${ROCM_PATH}/llvm/bin" CACHE PATH "Path to HIP clang binaries")
+    else()
+      set(HIP_CLANG_PATH $ENV{HIP_CLANG_PATH} CACHE PATH "Path to HIP clang binaries")
+    endif()
+  endif()
+
+  # note: could probably grab this path directly using hipconfig?
+  if (NOT DEFINED HIP_CLANG_INCLUDE_PATH)
+    if(NOT DEFINED ENV{HIP_CLANG_INCLUDE_PATH})
+      # probably need a better way to get the compiler version.. this will cause non-existent paths for non-clang compilers
+      set(HIP_CLANG_INCLUDE_PATH "${HIP_CLANG_PATH}/../lib/clang/${CMAKE_CXX_COMPILER_VERSION}/include" CACHE PATH "Path to HIP clang include directory")
+    else()
+      set(HIP_CLANG_INCLUDE_PATH $ENV{HIP_CLANG_INCLUDE_PATH} CACHE PATH "Path to HIP clang include directory")
+    endif()
+  endif()
+
+  if(NOT DEFINED HIPBLAS_PATH)
+    if(NOT DEFINED ENV{HIPBLAS_PATH})
+        set(HIPBLAS_PATH "${HIP_PATH}/../hipblas" CACHE PATH "Path to which HIPBLAS has been installed")
+    else()
+        set(HIPBLAS_PATH $ENV{HIPBLAS_PATH} CACHE PATH "Path to which HIPBLAS has been installed")
+    endif()
+  endif()
+
+  # try to find hipconfig executable which can help detect platforms and include dirs
+  find_program(ASGARD_HIPCONFIG_PATH hipconfig HINTS "${HIP_PATH}/bin")
+  if(ASGARD_HIPCONFIG_PATH)
+    execute_process(COMMAND ${ASGARD_HIPCONFIG_PATH} --platform OUTPUT_VARIABLE ASGARD_HIP_PLATFORM)
+  elseif(DEFINED ENV{HIP_PLATFORM})
+    set(ASGARD_HIP_PLATFORM "$ENV{HIP_PLATFORM}")
+  else()
+    message(FATAL_ERROR "Could not determine HIP platform, make sure HIP_PLATFORM is set")
+  endif()
+
+  message(STATUS "HIP platform has been detected as ${ASGARD_HIP_PLATFORM}")
+  # hip >= 4.2 is now using "amd" to identify platform
+  if(ASGARD_HIP_PLATFORM STREQUAL "hcc" OR ASGARD_HIP_PLATFORM STREQUAL "amd")
+    set(ASGARD_PLATFORM_AMD 1)
+  # hip <= 4.1 uses "nvcc" to identify nvidia platforms, >= 4.2 uses "nvidia"
+  elseif(ASGARD_HIP_PLATFORM STREQUAL "nvcc" OR ASGARD_HIP_PLATFORM STREQUAL "nvidia")
+    set(ASGARD_PLATFORM_NVCC 1)
+  endif()
+
+  # double check for cuda path since HIP uses it internally
+  if(ASGARD_PLATFORM_NVCC)
+    if (NOT DEFINED ENV{CUDA_PATH})
+      find_path(ASGARD_HIP_DEFAULT_CUDA_PATH "cuda.h" PATH /usr/local/cuda/include)
+      if (NOT ASGARD_HIP_DEFAULT_CUDA_PATH)
+        message(FATAL_ERROR "Make sure the CUDA_PATH env is set to locate for HIP")
+      endif()
+    endif()
+    message(STATUS "Found CUDA_PATH: $ENV{CUDA_PATH}")
+  endif()
+
+  # look for HIP cmake configs in different locations
+  list(APPEND CMAKE_MODULE_PATH "${HIP_PATH}/cmake" "${ROCM_PATH}")
+  if(ASGARD_PLATFORM_AMD)
+    # note: causes issues on nvidia, but might be needed on amd platforms?
+    list(APPEND CMAKE_PREFIX_PATH "${HIP_PATH}/lib/cmake" "${ROCM_PATH}")
+
+    # output a warning if compiling for AMD without using amd-clang
+    if(NOT CMAKE_CXX_COMPILER_ID MATCHES "Clang")
+      message(WARNING "Compiling HIP for AMD without using AMD clang might not work. Use -DCMAKE_CXX_COMPILER=clang++")
+    endif()
+  endif()
+  list(APPEND CMAKE_PREFIX_PATH "${HIPBLAS_PATH}/lib/cmake")
+
+  set(HIP_VERBOSE_BUILD ON CACHE STRING "Verbose compilation for HIP")
+
+  set(ASGARD_HIP_FLAGS "-std=c++14;-g" CACHE STRING "HIP compiler flags for both AMD and NVIDIA")
+  set(GPU_ARCH "70" CACHE STRING "AMD/NVIDIA GPU architecture number (such as 906 or 70)")
+
+  if(ASGARD_PLATFORM_AMD)
+    set(AMDGPU_TARGETS "gfx${GPU_ARCH}" CACHE STRING "GPU target architectures to compile for" FORCE)
+    set(GPU_TARGETS "gfx${GPU_ARCH}" CACHE STRING "GPU target architectures to compile for" FORCE)
+
+    # need a much later version for AMD since ipiv=nullptr fix not in hipblas until >4.3.1
+    find_package(HIP 4.3.0 REQUIRED)
+    find_package(hipblas 0.49 REQUIRED)
+  else()
+    find_package(HIP 4.0 REQUIRED)
+    find_package(hipblas REQUIRED)
+  endif()
+  # Print some debug info about HIP configuration
+  message(STATUS "HIP PLATFORM: ${HIP_PLATFORM}")
+  message(STATUS "HIP COMPILER: ${HIP_COMPILER}")
+  message(STATUS "HIP RUNTIME: ${HIP_RUNTIME}")
+  message(STATUS "HIP Includes: ${HIP_INCLUDE_DIRS}")
+  message(STATUS "HIP Libraries: ${HIP_LIBRARIES}")
+
+  if(ASGARD_PLATFORM_NVCC)
+    enable_language(CUDA)
+    set (CMAKE_CUDA_STANDARD 14)
+    set (CMAKE_CUDA_STANDARD_REQUIRED ON)
+    add_compile_definitions(__HIP_PLATFORM_NVCC__ __HIP_PLATFORM_NVIDIA__)
+    set(ASGARD_NVCC_FLAGS "${ASGARD_HIP_FLAGS}; -gencode arch=compute_${GPU_ARCH},code=compute_${GPU_ARCH} --ptxas-options=-O3 -lineinfo" CACHE STRING "Flags to pass to NVCC" FORCE) # nvcc specific options
+  elseif(ASGARD_PLATFORM_AMD)
+    #enable_language(HIP) # not yet added to latest cmake, but should be available in 3.21
+    # these compile definitions should be added automatically if using amd's clang, but
+    # may not necessarily be added if compiling with gcc or others
+    add_compile_definitions(__HIP_PLATFORM_HCC__ __HIP_PLATFORM_AMD__)
+    set(ASGARD_AMD_FLAGS "${ASGARD_HIP_FLAGS}; --amdgpu-target=gfx${GPU_ARCH};-O3" CACHE STRING "Flags to pass to amd-clang for HIP" FORCE) # amdgpu specific options
+  endif()
+
+  if (hipBLAS_FOUND)
+     message(STATUS "Found rocBLAS version ${rocBLAS_VERSION}: ${HIPBLAS_INCLUDE_DIRS}")
+  endif()
+
+  # set source file language properties
+  if(ASGARD_PLATFORM_AMD)
+    #set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE HIP ) # should work after cmake 3.21 release?
+    set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
+  elseif(ASGARD_PLATFORM_NVCC)
+    set_source_files_properties( src/device/kronmult_cuda.cpp PROPERTIES LANGUAGE CUDA ) # no .cu extension
+  endif()
 
 #  Turn on GPU support in kronmult.
   set (USE_GPU ON CACHE BOOL "Turn on kronmult gpu support" FORCE)
 endif()
 
+# Fetch kronmult after configuring everything, but before adding libraries
+FetchContent_MakeAvailable(kronmult)
+
+if(ASGARD_USE_HIP)
+  set(CMAKE_HIP_ARCHITECTURES OFF)
+  set(HIP_HIPCC_FLAGS "${HIP_HIPCC_FLAGS} ${ASGARD_HIP_FLAGS}" CACHE STRING "")
+  set(HIP_CLANG_FLAGS "${HIP_CLANG_FLAGS} ${ASGARD_AMD_FLAGS}" CACHE STRING "")
+  set(HIP_NVCC_FLAGS "${HIP_NVCC_FLAGS} ${ASGARD_NVCC_FLAGS}" CACHE STRING "")
+
+  hip_add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp
+                  HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}"
+                  NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}"
+                  CLANG_OPTIONS "${ASGARD_AMD_FLAGS}")
+  target_include_directories(kronmult_cuda PUBLIC ${HIP_INCLUDE_DIRS})
+  # assume this include path since HIP_INCLUDE_DIRS is not being set on nvidia platform
+  target_include_directories(kronmult_cuda PUBLIC "${HIP_PATH}/include")
+  target_include_directories(kronmult_cuda PUBLIC ${HIPBLAS_INCLUDE_DIRS})
+  if(ASGARD_PLATFORM_NVCC)
+    set_target_properties( kronmult_cuda PROPERTIES CUDA_ARCHITECTURES OFF)
+    set_target_properties( kronmult_cuda PROPERTIES COMPILE_FLAGS "-arch sm_${GPU_ARCH} -g -lineinfo --ptxas-options=-O3")
+  endif()
+  set_target_properties( kronmult_cuda PROPERTIES LINK_FLAGS "-Wl,-rpath,${CMAKE_BINARY_DIR}")
+else()
+  # build component to interface with Ed's kronmult lib
+  ##TODO: link to kronmult as interface library
+  add_library(kronmult_cuda SHARED src/device/kronmult_cuda.cpp)
+endif()
+
 if(ASGARD_USE_MKL)
-  if(ASGARD_USE_CUDA)
-    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp")
+  if(ASGARD_USE_HIP AND ASGARD_PLATFORM_NVCC)
+      set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --compiler-options -fopenmp")
   else()
     target_compile_options (kronmult_cuda PRIVATE "-fopenmp") # CMAKE doesn't handle MKL openmp link properly
     if(APPLE) # Need to link against the same openmp library as the MKL.
@@ -228,7 +374,22 @@ if (ASGARD_USE_MATLAB)
 endif ()
 
 foreach (component IN LISTS components)
-  add_library (${component} src/${component}.cpp)
+  if (ASGARD_USE_HIP)
+    hip_add_library (${component} src/${component}.cpp
+                     HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}"
+                     NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas"
+                     CLANG_OPTIONS "${ASGARD_AMD_FLAGS}")
+    set_target_properties(${component} PROPERTIES LINKER_LANGUAGE HIP)
+    target_include_directories(${component} SYSTEM PUBLIC ${HIP_INCLUDE_DIRS})
+    target_include_directories(${component} SYSTEM PUBLIC "${HIP_PATH}/include")
+    target_include_directories(${component} SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS})
+    if(ASGARD_PLATFORM_NVCC)
+      target_include_directories(${component} SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+    endif()
+    target_link_directories(${component} PRIVATE ${HIPBLAS_PATH}/lib/) # roc::hipblas target isn't linking properly on nvidia platforms
+  else()
+    add_library (${component} src/${component}.cpp)
+  endif()
   target_include_directories (${component} PRIVATE ${CMAKE_BINARY_DIR})
   if(ASGARD_USE_MKL)
     target_compile_options (${component} PRIVATE "-fopenmp") # CMAKE doesn't handle MKL openmp link properly
@@ -239,9 +400,7 @@ if (build_hdf5)
   add_dependencies (io hdf5-ext)
 endif ()
 
-if (build_kron)
-  add_dependencies (kronmult_cuda kronmult-ext)
-endif ()
+add_dependencies (kronmult_cuda kron)
 
 if (ASGARD_USE_SCALAPACK)
   target_link_libraries (tensors PRIVATE scalapack_matrix_info cblacs_grid)
@@ -281,7 +440,11 @@ if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL)
    target_link_libraries(kronmult PRIVATE OpenMP::OpenMP_CXX)
 endif ()
 
-target_link_libraries(kronmult_cuda PUBLIC kron)
+if (ASGARD_USE_HIP AND ASGARD_PLATFORM_AMD)
+  target_link_libraries(kronmult_cuda PUBLIC kron hip::device)
+else ()
+  target_link_libraries(kronmult_cuda PRIVATE kron)
+endif()
 
 if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL)
    target_link_libraries(kronmult_cuda PRIVATE OpenMP::OpenMP_CXX)
@@ -302,9 +465,11 @@ else ()
   target_link_libraries (lib_dispatch PRIVATE LINALG::LINALG)
 endif ()
 
-if (ASGARD_USE_CUDA)
-  target_link_libraries(lib_dispatch PRIVATE ${CUDA_LIBRARIES}
-					     ${CUDA_CUBLAS_LIBRARIES})
+if (ASGARD_USE_HIP)
+  if(ASGARD_PLATFORM_AMD)
+    target_link_libraries(lib_dispatch PRIVATE hip::device)
+    target_link_libraries(lib_dispatch PRIVATE roc::hipblas)
+  endif()
 endif()
 
 if (ASGARD_USE_OPENMP AND NOT ASGARD_USE_MKL)
@@ -330,8 +495,11 @@ target_link_libraries (quadrature PRIVATE matlab_utilities tensors)
 target_link_libraries (solver PRIVATE distribution fast_math lib_dispatch tensors)
 
 target_link_libraries (tensors PRIVATE lib_dispatch)
-if (ASGARD_USE_CUDA)
-  target_link_libraries (tensors PRIVATE ${CUDA_LIBRARIES})
+if (ASGARD_USE_HIP)
+  if(ASGARD_PLATFORM_AMD)
+    target_link_libraries(tensors PRIVATE hip::device)
+    target_link_libraries (tensors PRIVATE roc::hipblas)
+  endif()
 endif ()
 if (ASGARD_USE_SCALAPACK)	
   add_compile_definitions (ASGARD_USE_SCALAPACK)
@@ -345,7 +513,16 @@ target_link_libraries (transformations
   quadrature tensors)
 
 # define the main application and its linking
-add_executable (asgard src/main.cpp)
+if (ASGARD_USE_HIP)
+  hip_add_executable (asgard src/main.cpp
+                      HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}"
+                      NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas"
+                      CLANG_OPTIONS "${ASGARD_AMD_FLAGS}")
+  set_target_properties(asgard PROPERTIES LINKER_LANGUAGE HIP)
+  target_link_directories(asgard PRIVATE ${HIPBLAS_PATH}/lib/)
+else()
+  add_executable (asgard src/main.cpp)
+endif()
 
 # link in components needed directly by main
 set (main_app_link_deps
@@ -400,7 +577,21 @@ if (ASGARD_BUILD_TESTS)
   enable_testing ()
 
   # Define ctest tests and their executables
-  add_library (tests_general testing/tests_general.cpp)
+  if (ASGARD_USE_HIP)
+    hip_add_library (tests_general testing/tests_general.cpp
+                     HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}"
+                     NVCC_OPTIONS "${ASGARD_NVCC_FLAGS}"
+                     CLANG_OPTIONS "${ASGARD_AMD_FLAGS}")
+    target_include_directories(tests_general SYSTEM PUBLIC ${HIP_INCLUDE_DIRS})
+    target_include_directories(tests_general SYSTEM PUBLIC "${HIP_PATH}/include")
+    target_include_directories(tests_general SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS})
+    if (ASGARD_PLATFORM_NVCC)
+      target_include_directories(tests_general SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+      target_link_libraries(tests_general PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY})
+    endif()
+  else()
+    add_library (tests_general testing/tests_general.cpp)
+  endif()
   target_link_libraries (tests_general PUBLIC Catch PRIVATE pde program_options )
   target_include_directories(tests_general PRIVATE ${CMAKE_BINARY_DIR})
   
@@ -416,7 +607,22 @@ if (ASGARD_BUILD_TESTS)
   endif()
 
   foreach (component IN LISTS components)
-    add_executable (${component}-tests src/${component}_tests.cpp)
+    if (ASGARD_USE_HIP)
+      hip_add_executable (${component}-tests src/${component}_tests.cpp
+                          HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}"
+                          NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas"
+                          CLANG_OPTIONS "${ASGARD_AMD_FLAGS}")
+      if (ASGARD_PLATFORM_NVCC)
+        target_include_directories(${component}-tests SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+        target_link_libraries(${component}-tests PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY})
+        target_link_directories(${component}-tests PRIVATE ${HIPBLAS_PATH}/lib/)
+      endif()
+      target_include_directories(${component}-tests SYSTEM PUBLIC ${HIP_INCLUDE_DIRS})
+      target_include_directories(${component}-tests SYSTEM PUBLIC "${HIP_PATH}/include")
+      target_include_directories(${component}-tests SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS})
+    else()
+      add_executable (${component}-tests src/${component}_tests.cpp)
+    endif()
     target_include_directories (${component}-tests PRIVATE ${CMAKE_SOURCE_DIR}/testing)
     target_include_directories (${component}-tests PRIVATE ${CMAKE_BINARY_DIR})
     target_link_libraries (${component}-tests PRIVATE ${component} tests_general)
@@ -432,7 +638,7 @@ if (ASGARD_BUILD_TESTS)
       target_link_libraries (${component}-tests PRIVATE ${component} MPI::MPI_CXX)
       if (${component} IN_LIST mpi_test_components)
         set(test_ranks "4")
-      	if (ASGARD_USE_CUDA)
+        if (ASGARD_USE_HIP)
           set(test_ranks "1")
         endif ()
         if (${ASGARD_TESTING_RANKS})
@@ -450,7 +656,22 @@ if (ASGARD_BUILD_TESTS)
               WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR} )
   endforeach ()
 
-  add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp)
+  if (ASGARD_USE_HIP)
+    hip_add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp
+                        HIPCC_OPTIONS "${ASGARD_HIP_FLAGS}"
+                        NVCC_OPTIONS "${ASGARD_NVCC_FLAGS} -lhipblas"
+                        CLANG_OPTIONS "${ASGARD_AMD_FLAGS}")
+    if (ASGARD_PLATFORM_NVCC)
+      target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES})
+      target_link_libraries(kronmult_cuda-tests PRIVATE ${CMAKE_CUDA_RUNTIME_LIBRARY})
+      target_link_directories(kronmult_cuda-tests PRIVATE ${HIPBLAS_PATH}/lib/)
+    endif()
+    target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${HIP_INCLUDE_DIRS})
+    target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC "${HIP_PATH}/include")
+    target_include_directories(kronmult_cuda-tests SYSTEM PUBLIC ${HIPBLAS_INCLUDE_DIRS})
+  else()
+    add_executable (kronmult_cuda-tests src/device/kronmult_cuda_tests.cpp)
+  endif()
   target_include_directories (kronmult_cuda-tests PRIVATE ${CMAKE_SOURCE_DIR}/testing)
   target_include_directories (kronmult_cuda-tests PRIVATE ${CMAKE_BINARY_DIR})
   target_link_libraries (kronmult_cuda-tests PRIVATE coefficients kronmult_cuda tests_general)
diff --git a/contrib/FindLINALG.cmake b/contrib/FindLINALG.cmake
index b92d94607..7f1f06af4 100644
--- a/contrib/FindLINALG.cmake
+++ b/contrib/FindLINALG.cmake
@@ -54,6 +54,7 @@ if (${ASGARD_BUILD_OPENBLAS})
                       OPENBLAS
                       https://github.com/xianyi/OpenBLAS.git
                       v0.3.18
+                      ON
     )
 
 #  Fetch content does not run the install phase so the headers for openblas are
diff --git a/src/batch_tests.cpp b/src/batch_tests.cpp
index 757b3d4e8..4739b0f79 100644
--- a/src/batch_tests.cpp
+++ b/src/batch_tests.cpp
@@ -864,7 +864,11 @@ void test_batched_gemv(int const m, int const n, int const lda,
 
   batched_gemv(a_batch, x_batch, y_batch, alpha, beta);
 
-  P const tol_factor = 1e-17;
+  P tol_factor = 1e-17;
+  if constexpr (resrc == resource::device)
+  {
+    tol_factor = 1e-7;
+  }
   for (int i = 0; i < num_batch; ++i)
   {
     if constexpr (resrc == resource::host)
diff --git a/src/build_info.hpp.in b/src/build_info.hpp.in
index 089a874cd..019abe0b7 100644
--- a/src/build_info.hpp.in
+++ b/src/build_info.hpp.in
@@ -6,7 +6,7 @@
 #define BUILD_TIME "@BUILD_TIME@"
 
 #cmakedefine ASGARD_IO_HIGHFIVE
-#cmakedefine ASGARD_USE_CUDA
+#cmakedefine ASGARD_USE_HIP
 #cmakedefine ASGARD_USE_OPENMP
 #cmakedefine ASGARD_USE_MPI
 #cmakedefine ASGARD_USE_MATLAB
diff --git a/src/device/kronmult_cuda.cpp b/src/device/kronmult_cuda.cpp
index 9cc8a9d85..fe6573a2c 100644
--- a/src/device/kronmult_cuda.cpp
+++ b/src/device/kronmult_cuda.cpp
@@ -1,9 +1,8 @@
 #include "kronmult_cuda.hpp"
 #include "build_info.hpp"
 
-#ifdef ASGARD_USE_CUDA
-#include <cuda.h>
-#include <cuda_runtime.h>
+#ifdef ASGARD_USE_HIP
+#include <hip/hip_runtime.h>
 #define USE_GPU
 #define GLOBAL_FUNCTION __global__
 #define SYNCTHREADS __syncthreads()
@@ -47,7 +46,7 @@ GLOBAL_FUNCTION void
 stage_inputs_kronmult_kernel(P const *const x, P *const workspace,
                              int const num_elems, int const num_copies)
 {
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
 
   expect(blockIdx.y == 0);
   expect(blockIdx.z == 0);
@@ -90,7 +89,7 @@ void stage_inputs_kronmult(P const *const x, P *const workspace,
   expect(num_elems > 0);
   expect(num_copies > 0);
 
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
 
   auto constexpr warp_size   = 32;
   auto constexpr num_warps   = 8;
@@ -99,11 +98,12 @@ void stage_inputs_kronmult(P const *const x, P *const workspace,
   auto const total_copies = static_cast<int64_t>(num_elems) * num_copies;
   auto const num_blocks   = (total_copies + num_threads - 1) / num_threads;
 
-  stage_inputs_kronmult_kernel<P>
-      <<<num_blocks, num_threads>>>(x, workspace, num_elems, num_copies);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(stage_inputs_kronmult_kernel<P>),
+                     dim3(num_blocks), dim3(num_threads), 0, 0, x, workspace,
+                     num_elems, num_copies);
 
-  auto const stat = cudaDeviceSynchronize();
-  expect(stat == cudaSuccess);
+  auto const stat = hipDeviceSynchronize();
+  expect(stat == hipSuccess);
 #else
   stage_inputs_kronmult_kernel(x, workspace, num_elems, num_copies);
 #endif
@@ -175,7 +175,7 @@ prepare_kronmult_kernel(int const *const flattened_table,
   auto const coord_size = num_dims * 2;
   auto const num_elems  = static_cast<int64_t>(num_cols) * num_rows;
 
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
 
   expect(blockIdx.y == 0);
   expect(blockIdx.z == 0);
@@ -192,7 +192,7 @@ prepare_kronmult_kernel(int const *const flattened_table,
   auto const increment = 1;
 #endif
 
-#ifndef ASGARD_USE_CUDA
+#ifndef ASGARD_USE_HIP
 #ifdef ASGARD_USE_OPENMP
 #pragma omp parallel for
 #endif
@@ -273,7 +273,7 @@ void prepare_kronmult(int const *const flattened_table,
   expect(input_ptrs);
   expect(output_ptrs);
 
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
   auto constexpr warp_size   = 32;
   auto constexpr num_warps   = 8;
   auto constexpr num_threads = num_warps * warp_size;
@@ -281,12 +281,14 @@ void prepare_kronmult(int const *const flattened_table,
       static_cast<int64_t>(elem_col_stop - elem_col_start + 1) *
       (elem_row_stop - elem_row_start + 1);
   auto const num_blocks = (num_krons / num_threads) + 1;
-  prepare_kronmult_kernel<P><<<num_blocks, num_threads>>>(
-      flattened_table, operators, operator_lda, element_x, element_work, fx,
-      operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree, num_terms,
-      num_dims, elem_row_start, elem_row_stop, elem_col_start, elem_col_stop);
-  auto const stat = cudaDeviceSynchronize();
-  expect(stat == cudaSuccess);
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(prepare_kronmult_kernel<P>),
+                     dim3(num_blocks), dim3(num_threads), 0, 0, flattened_table,
+                     operators, operator_lda, element_x, element_work, fx,
+                     operator_ptrs, work_ptrs, input_ptrs, output_ptrs, degree,
+                     num_terms, num_dims, elem_row_start, elem_row_stop,
+                     elem_col_start, elem_col_stop);
+  auto const stat = hipDeviceSynchronize();
+  expect(stat == hipSuccess);
 #else
   prepare_kronmult_kernel(
       flattened_table, operators, operator_lda, element_x, element_work, fx,
@@ -304,7 +306,7 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[],
                    P const *const operator_ptrs[], int const lda,
                    int const num_krons, int const num_dims)
 {
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
   {
     int constexpr warpsize    = 32;
     int constexpr nwarps      = 1;
@@ -313,28 +315,40 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[],
     switch (num_dims)
     {
     case 1:
-      kronmult1_xbatched<P><<<num_krons, num_threads>>>(
-          n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult1_xbatched<P>),
+                         dim3(num_krons), dim3(num_threads), 0, 0, n,
+                         operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
+                         num_krons);
       break;
     case 2:
-      kronmult2_xbatched<P><<<num_krons, num_threads>>>(
-          n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult2_xbatched<P>),
+                         dim3(num_krons), dim3(num_threads), 0, 0, n,
+                         operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
+                         num_krons);
       break;
     case 3:
-      kronmult3_xbatched<P><<<num_krons, num_threads>>>(
-          n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult3_xbatched<P>),
+                         dim3(num_krons), dim3(num_threads), 0, 0, n,
+                         operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
+                         num_krons);
       break;
     case 4:
-      kronmult4_xbatched<P><<<num_krons, num_threads>>>(
-          n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult4_xbatched<P>),
+                         dim3(num_krons), dim3(num_threads), 0, 0, n,
+                         operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
+                         num_krons);
       break;
     case 5:
-      kronmult5_xbatched<P><<<num_krons, num_threads>>>(
-          n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult5_xbatched<P>),
+                         dim3(num_krons), dim3(num_threads), 0, 0, n,
+                         operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
+                         num_krons);
       break;
     case 6:
-      kronmult6_xbatched<P><<<num_krons, num_threads>>>(
-          n, operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs, num_krons);
+      hipLaunchKernelGGL(HIP_KERNEL_NAME(kronmult6_xbatched<P>),
+                         dim3(num_krons), dim3(num_threads), 0, 0, n,
+                         operator_ptrs, lda, x_ptrs, output_ptrs, work_ptrs,
+                         num_krons);
       break;
     default:
       expect(false);
@@ -343,8 +357,8 @@ void call_kronmult(int const n, P *x_ptrs[], P *output_ptrs[], P *work_ptrs[],
     // -------------------------------------------
     // note important to wait for kernel to finish
     // -------------------------------------------
-    auto const stat = cudaDeviceSynchronize();
-    expect(stat == cudaSuccess);
+    auto const stat = hipDeviceSynchronize();
+    expect(stat == hipSuccess);
   }
 #else
 
diff --git a/src/kronmult.cpp b/src/kronmult.cpp
index 44d8f0098..a040c28f4 100644
--- a/src/kronmult.cpp
+++ b/src/kronmult.cpp
@@ -3,8 +3,8 @@
 #include "lib_dispatch.hpp"
 #include "tools.hpp"
 
-#ifdef ASGARD_USE_CUDA
-#include <cuda_runtime.h>
+#ifdef ASGARD_USE_HIP
+#include <hip/hip_runtime.h>
 #endif
 
 #ifdef ASGARD_USE_OPENMP
diff --git a/src/kronmult.hpp b/src/kronmult.hpp
index 47e2f46f3..ad6cdbbf7 100644
--- a/src/kronmult.hpp
+++ b/src/kronmult.hpp
@@ -1,5 +1,5 @@
 #pragma once
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
 #define USE_GPU
 #endif
 #include "distribution.hpp"
diff --git a/src/lib_dispatch.cpp b/src/lib_dispatch.cpp
index 15277a3b0..b94ee451a 100644
--- a/src/lib_dispatch.cpp
+++ b/src/lib_dispatch.cpp
@@ -60,9 +60,9 @@ extern "C"
 #pragma GCC diagnostic pop
 #endif
 
-#ifdef ASGARD_USE_CUDA
-#include <cublas_v2.h>
-#include <cuda_runtime.h>
+#ifdef ASGARD_USE_HIP
+#include <hip/hip_runtime.h>
+#include <hipblas.h>
 #endif
 
 #ifdef ASGARD_USE_SCALAPACK
@@ -93,61 +93,61 @@ struct device_handler
 {
   device_handler()
   {
-#ifdef ASGARD_USE_CUDA
-    auto success = cublasCreate(&handle);
-    expect(success == CUBLAS_STATUS_SUCCESS);
+#ifdef ASGARD_USE_HIP
+    auto success = hipblasCreate(&handle);
+    expect(success == HIPBLAS_STATUS_SUCCESS);
 
-    success = cublasSetPointerMode(handle, CUBLAS_POINTER_MODE_HOST);
-    expect(success == CUBLAS_STATUS_SUCCESS);
+    success = hipblasSetPointerMode(handle, HIPBLAS_POINTER_MODE_HOST);
+    expect(success == HIPBLAS_STATUS_SUCCESS);
 #endif
   }
 
   void set_device(int const local_rank)
   {
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     int num_devices;
-    auto success = cudaGetDeviceCount(&num_devices);
+    auto success = hipGetDeviceCount(&num_devices);
 
-    expect(success == cudaSuccess);
+    expect(success == hipSuccess);
     expect(local_rank >= 0);
     expect(local_rank < num_devices);
 
     if (handle)
     {
-      auto const cublas_success = cublasDestroy(handle);
-      expect(cublas_success == CUBLAS_STATUS_SUCCESS);
+      auto const hipblas_success = hipblasDestroy(handle);
+      expect(hipblas_success == HIPBLAS_STATUS_SUCCESS);
     }
 
-    success = cudaSetDevice(local_rank);
-    expect(success == cudaSuccess);
-    auto const cublas_success = cublasCreate(&handle);
-    expect(cublas_success == CUBLAS_STATUS_SUCCESS);
+    success = hipSetDevice(local_rank);
+    expect(success == hipSuccess);
+    auto const hipblas_success = hipblasCreate(&handle);
+    expect(hipblas_success == HIPBLAS_STATUS_SUCCESS);
 
 #else
     ignore(local_rank);
 #endif
   }
 
-#ifdef ASGARD_USE_CUDA
-  cublasHandle_t const &get_handle() const { return handle; }
+#ifdef ASGARD_USE_HIP
+  hipblasHandle_t const &get_handle() const { return handle; }
 #endif
   ~device_handler()
   {
-#ifdef ASGARD_USE_CUDA
-    cublasDestroy(handle);
+#ifdef ASGARD_USE_HIP
+    hipblasDestroy(handle);
 #endif
   }
 
 private:
-#ifdef ASGARD_USE_CUDA
-  cublasHandle_t handle;
+#ifdef ASGARD_USE_HIP
+  hipblasHandle_t handle;
 #endif
 };
 static device_handler device;
 
 void initialize_libraries(int const local_rank)
 {
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
   expect(local_rank >= 0);
   device.set_device(local_rank);
 #else
@@ -155,16 +155,16 @@ void initialize_libraries(int const local_rank)
 #endif
 }
 
-#ifdef ASGARD_USE_CUDA
-inline cublasOperation_t cublas_trans(char trans)
+#ifdef ASGARD_USE_HIP
+inline hipblasOperation_t hipblas_trans(char trans)
 {
   if (trans == 'N' || trans == 'n')
   {
-    return CUBLAS_OP_N;
+    return HIPBLAS_OP_N;
   }
   else
   {
-    return CUBLAS_OP_T;
+    return HIPBLAS_OP_T;
   }
 }
 #endif
@@ -182,16 +182,16 @@ void rotg(P *a, P *b, P *c, P *s, resource const resrc)
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     // function instantiated for these two fp types
     if constexpr (std::is_same<P, double>::value)
     {
-      auto const success = cublasDrotg(device.get_handle(), a, b, c, s);
+      auto const success = hipblasDrotg(device.get_handle(), a, b, c, s);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
-      auto const success = cublasSrotg(device.get_handle(), a, b, c, s);
+      auto const success = hipblasSrotg(device.get_handle(), a, b, c, s);
       expect(success == 0);
     }
     return;
@@ -219,7 +219,7 @@ P nrm2(int *n, P *x, int *incx, resource const resrc)
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
     P norm;
@@ -227,13 +227,13 @@ P nrm2(int *n, P *x, int *incx, resource const resrc)
     if constexpr (std::is_same<P, double>::value)
     {
       auto const success =
-          cublasDnrm2(device.get_handle(), *n, x, *incx, &norm);
+          hipblasDnrm2(device.get_handle(), *n, x, *incx, &norm);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
       auto const success =
-          cublasSnrm2(device.get_handle(), *n, x, *incx, &norm);
+          hipblasSnrm2(device.get_handle(), *n, x, *incx, &norm);
       expect(success == 0);
     }
     return norm;
@@ -272,7 +272,7 @@ void copy(int *n, P *x, int *incx, P *y, int *incy, resource const resrc)
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
 
@@ -280,13 +280,13 @@ void copy(int *n, P *x, int *incx, P *y, int *incy, resource const resrc)
     if constexpr (std::is_same<P, double>::value)
     {
       auto const success =
-          cublasDcopy(device.get_handle(), *n, x, *incx, y, *incy);
+          hipblasDcopy(device.get_handle(), *n, x, *incx, y, *incy);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
       auto const success =
-          cublasScopy(device.get_handle(), *n, x, *incx, y, *incy);
+          hipblasScopy(device.get_handle(), *n, x, *incx, y, *incy);
       expect(success == 0);
     }
     return;
@@ -323,7 +323,7 @@ P dot(int *n, P *x, int *incx, P *y, int *incy, resource const resrc)
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
 
@@ -332,13 +332,13 @@ P dot(int *n, P *x, int *incx, P *y, int *incy, resource const resrc)
     if constexpr (std::is_same<P, double>::value)
     {
       auto const success =
-          cublasDdot(device.get_handle(), *n, x, *incx, y, *incy, &result);
+          hipblasDdot(device.get_handle(), *n, x, *incx, y, *incy, &result);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
       auto const success =
-          cublasSdot(device.get_handle(), *n, x, *incx, y, *incy, &result);
+          hipblasSdot(device.get_handle(), *n, x, *incx, y, *incy, &result);
       expect(success == 0);
     }
     return result;
@@ -379,7 +379,7 @@ void axpy(int *n, P *alpha, P *x, int *incx, P *y, int *incy,
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
 
@@ -387,13 +387,13 @@ void axpy(int *n, P *alpha, P *x, int *incx, P *y, int *incy,
     if constexpr (std::is_same<P, double>::value)
     {
       auto const success =
-          cublasDaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy);
+          hipblasDaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
       auto const success =
-          cublasSaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy);
+          hipblasSaxpy(device.get_handle(), *n, alpha, x, *incx, y, *incy);
       expect(success == 0);
     }
     return;
@@ -429,7 +429,7 @@ void scal(int *n, P *alpha, P *x, int *incx, resource const resrc)
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
 
@@ -437,13 +437,13 @@ void scal(int *n, P *alpha, P *x, int *incx, resource const resrc)
     if constexpr (std::is_same<P, double>::value)
     {
       auto const success =
-          cublasDscal(device.get_handle(), *n, alpha, x, *incx);
+          hipblasDscal(device.get_handle(), *n, alpha, x, *incx);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
       auto const success =
-          cublasSscal(device.get_handle(), *n, alpha, x, *incx);
+          hipblasSscal(device.get_handle(), *n, alpha, x, *incx);
       expect(success == 0);
     }
     return;
@@ -561,7 +561,7 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x,
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
 
@@ -569,15 +569,15 @@ void gemv(char const *trans, int *m, int *n, P *alpha, P *A, int *lda, P *x,
     if constexpr (std::is_same<P, double>::value)
     {
       auto const success =
-          cublasDgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha,
-                      A, *lda, x, *incx, beta, y, *incy);
+          hipblasDgemv(device.get_handle(), hipblas_trans(*trans), *m, *n,
+                       alpha, A, *lda, x, *incx, beta, y, *incy);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
       auto const success =
-          cublasSgemv(device.get_handle(), cublas_trans(*trans), *m, *n, alpha,
-                      A, *lda, x, *incx, beta, y, *incy);
+          hipblasSgemv(device.get_handle(), hipblas_trans(*trans), *m, *n,
+                       alpha, A, *lda, x, *incx, beta, y, *incy);
       expect(success == 0);
     }
     return;
@@ -627,23 +627,23 @@ void gemm(char const *transa, char const *transb, int *m, int *n, int *k,
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
 
     // instantiated for these two fp types
     if constexpr (std::is_same<P, double>::value)
     {
-      auto const success = cublasDgemm(
-          device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m,
-          *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc);
+      auto const success = hipblasDgemm(
+          device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb),
+          *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
-      auto const success = cublasSgemm(
-          device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m,
-          *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc);
+      auto const success = hipblasSgemm(
+          device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb),
+          *m, *n, *k, alpha, A, *lda, B, *ldb, beta, C, *ldc);
       expect(success == 0);
     }
     return;
@@ -686,7 +686,7 @@ void getrf(int *m, int *n, P *A, int *lda, int *ipiv, int *info,
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
 
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
@@ -694,22 +694,22 @@ void getrf(int *m, int *n, P *A, int *lda, int *ipiv, int *info,
     ignore(m);
 
     P **A_d;
-    auto stat = cudaMalloc((void **)&A_d, sizeof(P *));
+    auto stat = hipMalloc((void **)&A_d, sizeof(P *));
     expect(stat == 0);
-    stat = cudaMemcpy(A_d, &A, sizeof(P *), cudaMemcpyHostToDevice);
+    stat = hipMemcpy(A_d, &A, sizeof(P *), hipMemcpyHostToDevice);
     expect(stat == 0);
 
     // instantiated for these two fp types
     if constexpr (std::is_same<P, double>::value)
     {
-      auto const success = cublasDgetrfBatched(device.get_handle(), *n, A_d,
-                                               *lda, ipiv, info, 1);
+      auto const success = hipblasDgetrfBatched(device.get_handle(), *n, A_d,
+                                                *lda, ipiv, info, 1);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
-      auto const success = cublasSgetrfBatched(device.get_handle(), *n, A_d,
-                                               *lda, ipiv, info, 1);
+      auto const success = hipblasSgetrfBatched(device.get_handle(), *n, A_d,
+                                                *lda, ipiv, info, 1);
       expect(success == 0);
     }
     return;
@@ -747,7 +747,7 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info,
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
 
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
@@ -755,28 +755,28 @@ void getri(int *n, P *A, int *lda, int *ipiv, P *work, int *lwork, int *info,
     expect(*lwork == (*n) * (*n));
     ignore(lwork);
 
-    P const **A_d;
+    P **A_d; // hipBlas loses const to DgetriBatched
     P **work_d;
-    auto stat = cudaMalloc((void **)&A_d, sizeof(P *));
+    auto stat = hipMalloc((void **)&A_d, sizeof(P *));
     expect(stat == 0);
-    stat = cudaMalloc((void **)&work_d, sizeof(P *));
+    stat = hipMalloc((void **)&work_d, sizeof(P *));
     expect(stat == 0);
 
-    stat = cudaMemcpy(A_d, &A, sizeof(P *), cudaMemcpyHostToDevice);
+    stat = hipMemcpy(A_d, &A, sizeof(P *), hipMemcpyHostToDevice);
     expect(stat == 0);
-    stat = cudaMemcpy(work_d, &work, sizeof(P *), cudaMemcpyHostToDevice);
+    stat = hipMemcpy(work_d, &work, sizeof(P *), hipMemcpyHostToDevice);
     expect(stat == 0);
 
     // instantiated for these two fp types
     if constexpr (std::is_same<P, double>::value)
     {
-      auto const success = cublasDgetriBatched(
+      auto const success = hipblasDgetriBatched(
           device.get_handle(), *n, A_d, *lda, nullptr, work_d, *n, info, 1);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
-      auto const success = cublasSgetriBatched(
+      auto const success = hipblasSgetriBatched(
           device.get_handle(), *n, A_d, *lda, nullptr, work_d, *n, info, 1);
       expect(success == 0);
     }
@@ -824,7 +824,7 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b,
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
 
@@ -833,44 +833,44 @@ void batched_gemm(P **const &a, int *lda, char const *transa, P **const &b,
     P **c_d;
     size_t const list_size = *num_batch * sizeof(P *);
 
-    auto stat = cudaMalloc((void **)&a_d, list_size);
+    auto stat = hipMalloc((void **)&a_d, list_size);
     expect(stat == 0);
-    stat = cudaMalloc((void **)&b_d, list_size);
+    stat = hipMalloc((void **)&b_d, list_size);
     expect(stat == 0);
-    stat = cudaMalloc((void **)&c_d, list_size);
+    stat = hipMalloc((void **)&c_d, list_size);
     expect(stat == 0);
-    stat = cudaMemcpy(a_d, a, list_size, cudaMemcpyHostToDevice);
+    stat = hipMemcpy(a_d, a, list_size, hipMemcpyHostToDevice);
     expect(stat == 0);
-    stat = cudaMemcpy(b_d, b, list_size, cudaMemcpyHostToDevice);
+    stat = hipMemcpy(b_d, b, list_size, hipMemcpyHostToDevice);
     expect(stat == 0);
-    stat = cudaMemcpy(c_d, c, list_size, cudaMemcpyHostToDevice);
+    stat = hipMemcpy(c_d, c, list_size, hipMemcpyHostToDevice);
     expect(stat == 0);
 
     // instantiated for these two fp types
     if constexpr (std::is_same<P, double>::value)
     {
-      auto const success = cublasDgemmBatched(
-          device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m,
-          *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch);
-      auto const cuda_stat = cudaDeviceSynchronize();
-      expect(cuda_stat == 0);
+      auto const success = hipblasDgemmBatched(
+          device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb),
+          *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch);
+      auto const hip_stat = hipDeviceSynchronize();
+      expect(hip_stat == 0);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
-      auto const success = cublasSgemmBatched(
-          device.get_handle(), cublas_trans(*transa), cublas_trans(*transb), *m,
-          *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch);
-      auto const cuda_stat = cudaDeviceSynchronize();
-      expect(cuda_stat == 0);
+      auto const success = hipblasSgemmBatched(
+          device.get_handle(), hipblas_trans(*transa), hipblas_trans(*transb),
+          *m, *n, *k, alpha, a_d, *lda, b_d, *ldb, beta, c_d, *ldc, *num_batch);
+      auto const hip_stat = hipDeviceSynchronize();
+      expect(hip_stat == 0);
       expect(success == 0);
     }
 
-    stat = cudaFree(a_d);
+    stat = hipFree(a_d);
     expect(stat == 0);
-    stat = cudaFree(b_d);
+    stat = hipFree(b_d);
     expect(stat == 0);
-    stat = cudaFree(c_d);
+    stat = hipFree(c_d);
     expect(stat == 0);
 
     return;
@@ -912,7 +912,7 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x,
   if (resrc == resource::device)
   {
     // device-specific specialization if needed
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
     // no non-fp blas on device
     expect(std::is_floating_point_v<P>);
     char const transb = 'n';
@@ -929,46 +929,46 @@ void batched_gemv(P **const &a, int *lda, char const *trans, P **const &x,
     P **y_d;
     size_t const list_size = *num_batch * sizeof(P *);
 
-    auto stat = cudaMalloc((void **)&a_d, list_size);
+    auto stat = hipMalloc((void **)&a_d, list_size);
     expect(stat == 0);
-    stat = cudaMalloc((void **)&x_d, list_size);
+    stat = hipMalloc((void **)&x_d, list_size);
     expect(stat == 0);
-    stat = cudaMalloc((void **)&y_d, list_size);
+    stat = hipMalloc((void **)&y_d, list_size);
     expect(stat == 0);
-    stat = cudaMemcpy(a_d, a, list_size, cudaMemcpyHostToDevice);
+    stat = hipMemcpy(a_d, a, list_size, hipMemcpyHostToDevice);
     expect(stat == 0);
-    stat = cudaMemcpy(x_d, x, list_size, cudaMemcpyHostToDevice);
+    stat = hipMemcpy(x_d, x, list_size, hipMemcpyHostToDevice);
     expect(stat == 0);
-    stat = cudaMemcpy(y_d, y, list_size, cudaMemcpyHostToDevice);
+    stat = hipMemcpy(y_d, y, list_size, hipMemcpyHostToDevice);
     expect(stat == 0);
 
     // instantiated for these two fp types
     if constexpr (std::is_same<P, double>::value)
     {
-      auto const success = cublasDgemmBatched(
-          device.get_handle(), cublas_trans(*trans), cublas_trans(transb),
+      auto const success = hipblasDgemmBatched(
+          device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb),
           gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc,
           *num_batch);
-      auto const cuda_stat = cudaDeviceSynchronize();
-      expect(cuda_stat == 0);
+      auto const hip_stat = hipDeviceSynchronize();
+      expect(hip_stat == 0);
       expect(success == 0);
     }
     else if constexpr (std::is_same<P, float>::value)
     {
-      auto const success = cublasSgemmBatched(
-          device.get_handle(), cublas_trans(*trans), cublas_trans(transb),
+      auto const success = hipblasSgemmBatched(
+          device.get_handle(), hipblas_trans(*trans), hipblas_trans(transb),
           gemm_m, gemm_n, gemm_k, alpha, a_d, *lda, x_d, ldb, beta, y_d, ldc,
           *num_batch);
-      auto const cuda_stat = cudaDeviceSynchronize();
-      expect(cuda_stat == 0);
+      auto const hip_stat = hipDeviceSynchronize();
+      expect(hip_stat == 0);
       expect(success == 0);
     }
 
-    stat = cudaFree(a_d);
+    stat = hipFree(a_d);
     expect(stat == 0);
-    stat = cudaFree(x_d);
+    stat = hipFree(x_d);
     expect(stat == 0);
-    stat = cudaFree(y_d);
+    stat = hipFree(y_d);
     expect(stat == 0);
 
     return;
diff --git a/src/lib_dispatch_tests.cpp b/src/lib_dispatch_tests.cpp
index 87ea0c9ea..8c0eb3ca9 100644
--- a/src/lib_dispatch_tests.cpp
+++ b/src/lib_dispatch_tests.cpp
@@ -793,7 +793,7 @@ TEMPLATE_TEST_CASE("dot product (lib_dispatch::dot)", "[lib_dispatch]", float,
 TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)",
                    "[lib_dispatch]", float, double)
 {
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
 
   fk::matrix<TestType> const test{{0.767135868133925, -0.641484652834663},
                                   {0.641484652834663, 0.767135868133926}};
@@ -810,7 +810,7 @@ TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)",
   lib_dispatch::getrf(&m, &n, test_d.data(), &lda, ipiv_d.data(), info_d.data(),
                       resource::device);
 
-  auto stat = cudaDeviceSynchronize();
+  auto stat = hipDeviceSynchronize();
   REQUIRE(stat == 0);
   fk::vector<int> const info_check(info_d.clone_onto_host());
   REQUIRE(info_check(0) == 0);
@@ -822,7 +822,7 @@ TEMPLATE_TEST_CASE("device inversion test (lib_dispatch::getrf/getri)",
   lib_dispatch::getri(&n, test_d.data(), &lda, ipiv_d.data(), work.data(),
                       &size, info_d.data(), resource::device);
 
-  stat = cudaDeviceSynchronize();
+  stat = hipDeviceSynchronize();
   REQUIRE(stat == 0);
   fk::vector<int> const info_check_2(info_d.clone_onto_host());
   REQUIRE(info_check_2(0) == 0);
@@ -1179,7 +1179,11 @@ TEMPLATE_TEST_CASE_SIG("batched gemv", "[lib_dispatch]",
                        (double, resource::host), (double, resource::device),
                        (float, resource::host), (float, resource::device))
 {
-  TestType const tol_factor = 1e-18;
+  TestType tol_factor = 1e-18;
+  if constexpr (resrc == resource::device)
+  {
+    tol_factor = 1e-7;
+  }
 
   SECTION("batched gemv: no trans, alpha = 1.0, beta = 0.0")
   {
diff --git a/src/program_options.cpp b/src/program_options.cpp
index 548c17f3d..6663686ef 100644
--- a/src/program_options.cpp
+++ b/src/program_options.cpp
@@ -211,7 +211,7 @@ parser::parser(int argc, char **argv)
     }
   }
 
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
   if (use_implicit_stepping)
   {
     std::cerr << "GPU acceleration not implemented for implicit stepping\n";
diff --git a/src/tensors.hpp b/src/tensors.hpp
index 90eecd8c7..0fff81f60 100644
--- a/src/tensors.hpp
+++ b/src/tensors.hpp
@@ -1,8 +1,8 @@
 #pragma once
 #include "build_info.hpp"
 
-#ifdef ASGARD_USE_CUDA
-#include <cuda_runtime.h>
+#ifdef ASGARD_USE_HIP
+#include <hip/hip_runtime.h>
 #endif
 
 #include "lib_dispatch.hpp"
@@ -639,9 +639,9 @@ template<typename P>
 inline void
 allocate_device(P *&ptr, int const num_elems, bool const initialize = true)
 {
-#ifdef ASGARD_USE_CUDA
-  auto success = cudaMalloc((void **)&ptr, num_elems * sizeof(P));
-  assert(success == cudaSuccess);
+#ifdef ASGARD_USE_HIP
+  auto success = hipMalloc((void **)&ptr, num_elems * sizeof(P));
+  assert(success == hipSuccess);
   if (num_elems > 0)
   {
     expect(ptr != nullptr);
@@ -649,8 +649,8 @@ allocate_device(P *&ptr, int const num_elems, bool const initialize = true)
 
   if (initialize)
   {
-    success = cudaMemset((void *)ptr, 0, num_elems * sizeof(P));
-    expect(success == cudaSuccess);
+    success = hipMemset((void *)ptr, 0, num_elems * sizeof(P));
+    expect(success == hipSuccess);
   }
 
 #else
@@ -668,12 +668,12 @@ allocate_device(P *&ptr, int const num_elems, bool const initialize = true)
 template<typename P>
 inline void delete_device(P *const ptr)
 {
-#ifdef ASGARD_USE_CUDA
-  auto const success = cudaFree(ptr);
+#ifdef ASGARD_USE_HIP
+  auto const success = hipFree(ptr);
   // the device runtime may be unloaded at process shut down
   // (when static storage duration destructors are called)
   // returning a cudartUnloading error code.
-  expect((success == cudaSuccess) || (success == cudaErrorCudartUnloading));
+  expect((success == hipSuccess) || (success == hipErrorDeinitialized));
 #else
   delete[] ptr;
 #endif
@@ -683,10 +683,10 @@ template<typename P>
 inline void
 copy_on_device(P *const dest, P const *const source, int const num_elems)
 {
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
   auto const success =
-      cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyDeviceToDevice);
-  expect(success == cudaSuccess);
+      hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyDeviceToDevice);
+  expect(success == hipSuccess);
 #else
   std::copy(source, source + num_elems, dest);
 #endif
@@ -696,10 +696,10 @@ template<typename P>
 inline void
 copy_to_device(P *const dest, P const *const source, int const num_elems)
 {
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
   auto const success =
-      cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyHostToDevice);
-  expect(success == cudaSuccess);
+      hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyHostToDevice);
+  expect(success == hipSuccess);
 #else
   std::copy(source, source + num_elems, dest);
 #endif
@@ -709,10 +709,10 @@ template<typename P>
 inline void
 copy_to_host(P *const dest, P const *const source, int const num_elems)
 {
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
   auto const success =
-      cudaMemcpy(dest, source, num_elems * sizeof(P), cudaMemcpyDeviceToHost);
-  expect(success == cudaSuccess);
+      hipMemcpy(dest, source, num_elems * sizeof(P), hipMemcpyDeviceToHost);
+  expect(success == hipSuccess);
 #else
   std::copy(source, source + num_elems, dest);
 #endif
@@ -726,11 +726,14 @@ copy_matrix_on_device(fk::matrix<P, mem, resource::device> &dest,
   expect(source.nrows() == dest.nrows());
   expect(source.ncols() == dest.ncols());
 
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
+  // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0
+  if (source.stride() == 0)
+    return;
   auto const success =
-      cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(),
-                   source.stride() * sizeof(P), source.nrows() * sizeof(P),
-                   source.ncols(), cudaMemcpyDeviceToDevice);
+      hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(),
+                  source.stride() * sizeof(P), source.nrows() * sizeof(P),
+                  source.ncols(), hipMemcpyDeviceToDevice);
   expect(success == 0);
 #else
   std::copy(source.begin(), source.end(), dest.begin());
@@ -745,11 +748,14 @@ copy_matrix_to_device(fk::matrix<P, mem, resource::device> &dest,
 {
   expect(source.nrows() == dest.nrows());
   expect(source.ncols() == dest.ncols());
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
+  // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0
+  if (source.stride() == 0)
+    return;
   auto const success =
-      cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(),
-                   source.stride() * sizeof(P), source.nrows() * sizeof(P),
-                   source.ncols(), cudaMemcpyHostToDevice);
+      hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(),
+                  source.stride() * sizeof(P), source.nrows() * sizeof(P),
+                  source.ncols(), hipMemcpyHostToDevice);
   expect(success == 0);
 #else
   std::copy(source.begin(), source.end(), dest.begin());
@@ -764,11 +770,14 @@ copy_matrix_to_host(fk::matrix<P, mem, resource::host> &dest,
 {
   expect(source.nrows() == dest.nrows());
   expect(source.ncols() == dest.ncols());
-#ifdef ASGARD_USE_CUDA
+#ifdef ASGARD_USE_HIP
+  // on AMD, hipMemcpy2D will give throw an error if dpitch or spitch is 0
+  if (source.stride() == 0)
+    return;
   auto const success =
-      cudaMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(),
-                   source.stride() * sizeof(P), source.nrows() * sizeof(P),
-                   source.ncols(), cudaMemcpyDeviceToHost);
+      hipMemcpy2D(dest.data(), dest.stride() * sizeof(P), source.data(),
+                  source.stride() * sizeof(P), source.nrows() * sizeof(P),
+                  source.ncols(), hipMemcpyDeviceToHost);
   expect(success == 0);
 #else
   std::copy(source.begin(), source.end(), dest.begin());