diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2d7497490..7596fae92 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -35,8 +35,15 @@ if (${CMAKE_SYSTEM_NAME} MATCHES "Windows")
     list(APPEND SPLA_DEFINES SPLA_TARGET_WINDOWS)
 elseif (${CMAKE_SYSTEM_NAME} MATCHES "Linux")
     set(SPLA_TARGET_LINUX YES)
-    set(SPLA_ARCH "x64")
     set(SPLA_EXT "so")
+    if (NOT SPLA_ARCH)
+        if (CMAKE_SYSTEM_PROCESSOR MATCHES riscv)
+            set(SPLA_ARCH "riscv")
+        else ()
+            set(SPLA_ARCH "x64")
+        endif ()
+    endif ()
+    message(STATUS "Build Linux binaries for ${SPLA_ARCH} architecture")
     list(APPEND SPLA_DEFINES SPLA_TARGET_LINUX)
 elseif (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
     set(SPLA_TARGET_MACOSX YES)
@@ -417,4 +424,4 @@ if (SPLA_BUILD_TESTS)
                 COMMENT "Copy ${TARGET_FILE} into test directory")
     endforeach ()
 endif ()
-endif ()
\ No newline at end of file
+endif ()
diff --git a/src/opencl/cl_accelerator.cpp b/src/opencl/cl_accelerator.cpp
index ad2c823a3..9d8962162 100644
--- a/src/opencl/cl_accelerator.cpp
+++ b/src/opencl/cl_accelerator.cpp
@@ -108,6 +108,7 @@ namespace spla {
         m_is_nvidia = false;
         m_is_amd = false;
         m_is_intel = false;
+        m_is_img = false;
 
         if (m_vendor_name.find("Intel") != std::string::npos ||
             m_vendor_name.find("intel") != std::string::npos ||
@@ -141,6 +142,15 @@ namespace spla {
             // Likely, it is an integrated amd device
             if (m_max_wgs <= 256 || m_max_cu == 1) m_wave_size = 16;
         }
+        if (m_vendor_name.find("Imagination Technologies") != std::string::npos ||
+            m_vendor_name.find("IMG") != std::string::npos ||
+            m_vendor_name.find("img") != std::string::npos ||
+            m_vendor_id == 0x1010) {
+            m_vendor_code = VENDOR_CODE_IMG;
+            m_default_wgs = 32;
+            m_wave_size = 32;
+            m_is_img = true;
+        }
 
         if (m_vendor_code.empty()) {
             LOG_MSG(Status::Error, "failed to match one of the pre-defined vendors");
diff --git a/src/opencl/cl_accelerator.hpp b/src/opencl/cl_accelerator.hpp
index ac4842a03..f39d9e521 100644
--- a/src/opencl/cl_accelerator.hpp
+++ b/src/opencl/cl_accelerator.hpp
@@ -49,6 +49,7 @@
 #define VENDOR_CODE_NVIDIA "nvidia"
 #define VENDOR_CODE_INTEL "intel"
 #define VENDOR_CODE_AMD "amd"
+#define VENDOR_CODE_IMG "img"
 
 namespace spla {
 
@@ -96,6 +97,7 @@ namespace spla {
         [[nodiscard]] bool is_nvidia() const { return m_is_nvidia; }
         [[nodiscard]] bool is_amd() const { return m_is_amd; }
         [[nodiscard]] bool is_intel() const { return m_is_intel; }
+        [[nodiscard]] bool is_img() const { return m_is_img; }
 
     private:
         cl::Platform m_platform;
@@ -123,6 +125,7 @@ namespace spla {
         bool m_is_nvidia = false;
         bool m_is_amd = false;
         bool m_is_intel = false;
+        bool m_is_img = false;
 
         ankerl::svector m_queues;
     };
diff --git a/src/opencl/cl_format_coo_vec.hpp b/src/opencl/cl_format_coo_vec.hpp
index c5a924a02..b9ea9527d 100644
--- a/src/opencl/cl_format_coo_vec.hpp
+++ b/src/opencl/cl_format_coo_vec.hpp
@@ -105,7 +105,8 @@ namespace spla {
                             T* Ax,
                             const CLCooVec<T>& storage,
                             cl::CommandQueue& queue,
-                            bool blocking = true) {
+                            cl_mem_flags staging_flags = CL_MEM_READ_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                            bool blocking = true) {
         if (n_values == 0) {
             LOG_MSG(Status::Ok, "nothing to do");
             return;
@@ -113,10 +114,9 @@ namespace spla {
 
         const std::size_t buffer_size_Ai = n_values * sizeof(uint);
         const std::size_t buffer_size_Ax = n_values * sizeof(T);
-        const auto flags = CL_MEM_READ_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
 
-        cl::Buffer staging_Ai(get_acc_cl()->get_context(), flags, buffer_size_Ai);
-        cl::Buffer staging_Ax(get_acc_cl()->get_context(), flags, buffer_size_Ax);
+        cl::Buffer staging_Ai(get_acc_cl()->get_context(), staging_flags, buffer_size_Ai);
+        cl::Buffer staging_Ax(get_acc_cl()->get_context(), staging_flags, buffer_size_Ax);
 
         queue.enqueueCopyBuffer(storage.Ai, staging_Ai, 0, 0, buffer_size_Ai);
         queue.enqueueCopyBuffer(storage.Ax, staging_Ax, 0, 0, buffer_size_Ax);
diff --git a/src/opencl/cl_format_csr.hpp b/src/opencl/cl_format_csr.hpp
index 7791dcb8d..ec6e105bb 100644
--- a/src/opencl/cl_format_csr.hpp
+++ b/src/opencl/cl_format_csr.hpp
@@ -84,16 +84,15 @@ namespace spla {
                         T* Ax,
                         CLCsr<T>& storage,
                         cl::CommandQueue& queue,
-                        bool blocking = true) {
+                        cl_mem_flags staging_flags = CL_MEM_READ_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                        bool blocking = true) {
         const std::size_t buffer_size_Ap = (n_rows + 1) * sizeof(uint);
         const std::size_t buffer_size_Aj = n_values * sizeof(uint);
         const std::size_t buffer_size_Ax = n_values * sizeof(T);
 
-        const auto flags = CL_MEM_READ_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
-
-        cl::Buffer staging_Ap(get_acc_cl()->get_context(), flags, buffer_size_Ap);
-        cl::Buffer staging_Aj(get_acc_cl()->get_context(), flags, buffer_size_Aj);
-        cl::Buffer staging_Ax(get_acc_cl()->get_context(), flags, buffer_size_Ax);
+        cl::Buffer staging_Ap(get_acc_cl()->get_context(), staging_flags, buffer_size_Ap);
+        cl::Buffer staging_Aj(get_acc_cl()->get_context(), staging_flags, buffer_size_Aj);
+        cl::Buffer staging_Ax(get_acc_cl()->get_context(), staging_flags, buffer_size_Ax);
 
         queue.enqueueCopyBuffer(storage.Ap, staging_Ap, 0, 0, buffer_size_Ap);
         queue.enqueueCopyBuffer(storage.Aj, staging_Aj, 0, 0, buffer_size_Aj);
diff --git a/src/opencl/cl_format_dense_vec.hpp b/src/opencl/cl_format_dense_vec.hpp
index 4a65349bd..563eac56e 100644
--- a/src/opencl/cl_format_dense_vec.hpp
+++ b/src/opencl/cl_format_dense_vec.hpp
@@ -77,9 +77,10 @@ namespace spla {
                               T* values,
                               CLDenseVec<T>& storage,
                               cl::CommandQueue& queue,
-                              bool blocking = true) {
+                              cl_mem_flags staging_flags = CL_MEM_READ_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
+                              bool blocking = true) {
         const std::size_t buffer_size = n_rows * sizeof(T);
-        cl::Buffer staging(get_acc_cl()->get_context(), CL_MEM_READ_ONLY | CL_MEM_HOST_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, buffer_size);
+        cl::Buffer staging(get_acc_cl()->get_context(), staging_flags, buffer_size);
 
         queue.enqueueCopyBuffer(storage.Ax, staging, 0, 0, buffer_size);
         queue.enqueueReadBuffer(staging, blocking, 0, buffer_size, values);
diff --git a/src/storage/storage_manager_matrix.hpp b/src/storage/storage_manager_matrix.hpp
index 341c7e459..259de597e 100644
--- a/src/storage/storage_manager_matrix.hpp
+++ b/src/storage/storage_manager_matrix.hpp
@@ -146,7 +146,15 @@ namespace spla {
             auto* cl_csr = s.template get<CLCsr<T>>();
             auto* cpu_csr = s.template get<CpuCsr<T>>();
             cpu_csr_resize(s.get_n_rows(), cl_csr->values, *cpu_csr);
-            cl_csr_read(s.get_n_rows(), cl_csr->values, cpu_csr->Ap.data(), cpu_csr->Aj.data(), cpu_csr->Ax.data(), *cl_csr, cl_acc->get_queue_default());
+            if (!cl_acc->is_img()) {
+                cl_csr_read(s.get_n_rows(), cl_csr->values, cpu_csr->Ap.data(), cpu_csr->Aj.data(), cpu_csr->Ax.data(), *cl_csr, cl_acc->get_queue_default());
+            } else {
+                // On Imagination Technologies devices, copying into a staging buffer created with the CL_MEM_READ_ONLY flag leaves that buffer unchanged.
+                // According to the [documentation](https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#clEnqueueCopyBuffer),
+                // this flag should not affect the copy, but the staging buffer has to be created without it to read back correct data.
+                cl_csr_read(s.get_n_rows(), cl_csr->values, cpu_csr->Ap.data(), cpu_csr->Aj.data(), cpu_csr->Ax.data(), *cl_csr, cl_acc->get_queue_default(),
+                            CL_MEM_HOST_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
+            }
         });
 #endif
     }
diff --git a/src/storage/storage_manager_vector.hpp b/src/storage/storage_manager_vector.hpp
index 98d7a6e1f..1f26c3477 100644
--- a/src/storage/storage_manager_vector.hpp
+++ b/src/storage/storage_manager_vector.hpp
@@ -126,7 +126,15 @@ namespace spla {
             auto* cl_acc = get_acc_cl();
             auto* cl_dense = s.template get<CLDenseVec<T>>();
             auto* cpu_dense = s.template get<CpuDenseVec<T>>();
-            cl_dense_vec_read(s.get_n_rows(), cpu_dense->Ax.data(), *cl_dense, cl_acc->get_queue_default());
+            if (!cl_acc->is_img()) {
+                cl_dense_vec_read(s.get_n_rows(), cpu_dense->Ax.data(), *cl_dense, cl_acc->get_queue_default());
+            } else {
+                // On Imagination Technologies devices, copying into a staging buffer created with the CL_MEM_READ_ONLY flag leaves that buffer unchanged.
+                // According to the [documentation](https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#clEnqueueCopyBuffer),
+                // this flag should not affect the copy, but the staging buffer has to be created without it to read back correct data.
+                cl_dense_vec_read(s.get_n_rows(), cpu_dense->Ax.data(), *cl_dense, cl_acc->get_queue_default(),
+                                  CL_MEM_HOST_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
+            }
         });
         manager.register_converter(FormatVector::CpuCoo, FormatVector::AccCoo, [](Storage& s) {
             auto* cpu_coo = s.template get<CpuCooVec<T>>();
@@ -138,7 +146,15 @@ namespace spla {
             auto* cl_coo = s.template get<CLCooVec<T>>();
             auto* cpu_coo = s.template get<CpuCooVec<T>>();
             cpu_coo_vec_resize(cl_coo->values, *cpu_coo);
-            cl_coo_vec_read(cl_coo->values, cpu_coo->Ai.data(), cpu_coo->Ax.data(), *cl_coo, cl_acc->get_queue_default());
+            if (!cl_acc->is_img()) {
+                cl_coo_vec_read(cl_coo->values, cpu_coo->Ai.data(), cpu_coo->Ax.data(), *cl_coo, cl_acc->get_queue_default());
+            } else {
+                // On Imagination Technologies devices, copying into a staging buffer created with the CL_MEM_READ_ONLY flag leaves that buffer unchanged.
+                // According to the [documentation](https://registry.khronos.org/OpenCL/specs/3.0-unified/html/OpenCL_API.html#clEnqueueCopyBuffer),
+                // this flag should not affect the copy, but the staging buffer has to be created without it to read back correct data.
+                cl_coo_vec_read(cl_coo->values, cpu_coo->Ai.data(), cpu_coo->Ax.data(), *cl_coo, cl_acc->get_queue_default(),
+                                CL_MEM_HOST_READ_ONLY | CL_MEM_ALLOC_HOST_PTR);
+            }
         });
         manager.register_converter(FormatVector::AccCoo, FormatVector::AccDense, [](Storage& s) {
            auto* cl_acc = get_acc_cl();
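Note: below is a minimal standalone sketch of the staging-buffer read pattern that the new staging_flags parameter controls. It is not part of the patch and not spla code; it assumes a default OpenCL platform/device, the Khronos C++ bindings header CL/opencl.hpp, and illustrative buffer names. Dropping CL_MEM_READ_ONLY from the staging flags is the same adjustment the storage managers above apply on Imagination Technologies devices.

// Sketch of the device-buffer -> staging-buffer -> host read pattern.
// Assumptions: default device, CL/opencl.hpp C++ bindings; error handling omitted.
#include <CL/opencl.hpp>
#include <cstdio>
#include <vector>

int main() {
    cl::Context      context(CL_DEVICE_TYPE_DEFAULT);
    cl::CommandQueue queue(context);

    const std::size_t n     = 1024;
    const std::size_t bytes = n * sizeof(float);

    std::vector<float> src(n, 1.0f), dst(n, 0.0f);

    // Device buffer holding the data we want to read back.
    cl::Buffer device_buf(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, bytes, src.data());

    // Staging buffer. On IMG devices CL_MEM_READ_ONLY is omitted here,
    // otherwise the device-to-staging copy leaves the buffer contents unchanged.
    const cl_mem_flags staging_flags = CL_MEM_HOST_READ_ONLY | CL_MEM_ALLOC_HOST_PTR;
    cl::Buffer staging(context, staging_flags, bytes);

    // Device-to-staging copy, then a blocking read into host memory.
    queue.enqueueCopyBuffer(device_buf, staging, 0, 0, bytes);
    queue.enqueueReadBuffer(staging, CL_TRUE, 0, bytes, dst.data());

    std::printf("dst[0] = %f (expected 1.0)\n", dst[0]);
    return 0;
}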