diff --git a/clic/clic.hpp.in b/clic/clic.hpp.in index fcaba8901..6e303e3e0 100644 --- a/clic/clic.hpp.in +++ b/clic/clic.hpp.in @@ -23,4 +23,10 @@ # include #endif +#if USE_OPENCL + const constexpr size_t GPU_MEM_PTR_SIZE = sizeof(cl_mem); +#else + const constexpr size_t GPU_MEM_PTR_SIZE = sizeof(void *); +#endif + #endif // __CLIC_HPP diff --git a/clic/include/array.hpp b/clic/include/array.hpp index c5fd85ad4..88335c906 100644 --- a/clic/include/array.hpp +++ b/clic/include/array.hpp @@ -5,9 +5,6 @@ #include "device.hpp" #include "utils.hpp" -#include -#include - namespace cle { @@ -22,22 +19,22 @@ class Array return std::shared_ptr(new Array()); } static auto - create(const size_t & width, - const size_t & height, - const size_t & depth, + create(size_t width, + size_t height, + size_t depth, const dType & data_type, const mType & mem_type, const Device::Pointer & device_ptr) -> Array::Pointer; static auto - create(const size_t & width, - const size_t & height, - const size_t & depth, + create(size_t width, + size_t height, + size_t depth, const dType & data_type, const mType & mem_type, const void * host_data, const Device::Pointer & device_ptr) -> Array::Pointer; static auto - create(Array::Pointer array) -> Array::Pointer; + create(const Array::Pointer & array) -> Array::Pointer; friend auto operator<<(std::ostream & out, const Array::Pointer & array) -> std::ostream &; @@ -52,7 +49,7 @@ class Array write(const void * host_data, const std::array & region, const std::array & buffer_origin) -> void; auto - write(const void * host_data, const size_t & x_coord, const size_t & y_coord, const size_t & z_coord) -> void; + write(const void * host_data, size_t x_coord, size_t y_coord, size_t z_coord) -> void; auto read(void * host_data) const -> void; @@ -60,13 +57,18 @@ class Array read(void * host_data, const std::array & region, const std::array & buffer_origin) const -> void; auto - read(void * host_data, const size_t & x_coord, const size_t & y_coord, const size_t & z_coord) const -> void; + read(void * host_data, size_t x_coord, size_t y_coord, size_t z_coord) const -> void; auto copy(const Array::Pointer & dst) const -> void; + auto + copy(const Array::Pointer & dst, + const std::array & region, + const std::array & src_origin, + const std::array & dst_origin) const -> void; auto - fill(const float & value) const -> void; + fill(float value) const -> void; [[nodiscard]] auto size() const -> size_t; @@ -102,9 +104,9 @@ class Array using MemoryPointer = std::shared_ptr; Array() = default; - Array(const size_t & width, - const size_t & height, - const size_t & depth, + Array(size_t width, + size_t height, + size_t depth, const dType & data_type, const mType & mem_type, const Device::Pointer & device_ptr); diff --git a/clic/include/backend.hpp b/clic/include/backend.hpp index af8e6b44b..ceb4060de 100644 --- a/clic/include/backend.hpp +++ b/clic/include/backend.hpp @@ -6,8 +6,7 @@ #include "utils.hpp" #include -#include -#include +#include #include namespace cle @@ -74,33 +73,45 @@ class Backend virtual auto - copyMemoryBufferToBuffer(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void = 0; + copyMemoryBufferToBuffer(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void = 0; virtual auto - copyMemoryImageToBuffer(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void = 0; + copyMemoryImageToBuffer(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void = 0; virtual auto - copyMemoryBufferToImage(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void = 0; + copyMemoryBufferToImage(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void = 0; virtual auto - copyMemoryImageToImage(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void = 0; + copyMemoryImageToImage(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void = 0; virtual auto setMemory(const Device::Pointer & device, @@ -226,33 +237,45 @@ class CUDABackend : public Backend void * host_ptr) const -> void override; auto - copyMemoryBufferToBuffer(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void override; - auto - copyMemoryImageToBuffer(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void override; - auto - copyMemoryBufferToImage(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void override; - auto - copyMemoryImageToImage(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void override; + copyMemoryBufferToBuffer(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void override; + auto + copyMemoryImageToBuffer(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void override; + auto + copyMemoryBufferToImage(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void override; + auto + copyMemoryImageToImage(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void override; auto setMemory(const Device::Pointer & device, @@ -383,37 +406,47 @@ class OpenCLBackend : public Backend const mType & mtype, void * host_ptr) const -> void override; - auto - copyMemoryBufferToBuffer(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void override; - - auto - copyMemoryImageToBuffer(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void override; - - auto - copyMemoryBufferToImage(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void override; auto - copyMemoryImageToImage(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void override; + copyMemoryBufferToBuffer(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void override; + auto + copyMemoryImageToBuffer(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void override; + auto + copyMemoryBufferToImage(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void override; + auto + copyMemoryImageToImage(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void override; auto setMemory(const Device::Pointer & device, diff --git a/clic/include/device.hpp b/clic/include/device.hpp index 38edd8cf4..3d0789ae3 100644 --- a/clic/include/device.hpp +++ b/clic/include/device.hpp @@ -1,12 +1,12 @@ #ifndef __INCLUDE_DEVICE_HPP #define __INCLUDE_DEVICE_HPP +#include "clic.hpp" + #include -#include #include #include - -#include "clic.hpp" +#include namespace cle { @@ -23,12 +23,6 @@ class Device Device() = default; virtual ~Device() = default; - Device(const Device &) = default; - Device(Device &&) = default; - auto - operator=(const Device &) -> Device & = default; - auto - operator=(Device &&) -> Device & = default; virtual auto initialize() -> void = 0; @@ -37,7 +31,7 @@ class Device virtual auto finish() const -> void = 0; virtual auto - setWaitToFinish(bool) -> void = 0; + setWaitToFinish(bool flag) -> void = 0; [[nodiscard]] virtual auto isInitialized() const -> bool = 0; @@ -75,14 +69,10 @@ class Device class OpenCLDevice : public Device { public: + using CacheType = std::unordered_map; + OpenCLDevice(const cl_platform_id & platform, const cl_device_id & device); ~OpenCLDevice() override; - OpenCLDevice(const OpenCLDevice &) = default; - OpenCLDevice(OpenCLDevice &&) = default; - auto - operator=(const OpenCLDevice &) -> OpenCLDevice & = default; - auto - operator=(OpenCLDevice &&) -> OpenCLDevice & = default; auto initialize() -> void override; @@ -91,7 +81,7 @@ class OpenCLDevice : public Device auto finish() const -> void override; auto - setWaitToFinish(bool) -> void override; + setWaitToFinish(bool flag) -> void override; [[nodiscard]] auto getType() const -> Device::Type override; @@ -110,16 +100,16 @@ class OpenCLDevice : public Device [[nodiscard]] auto getInfo() const -> std::string override; [[nodiscard]] auto - getCache() -> std::map &; + getCache() -> CacheType &; private: - cl_device_id clDevice; - cl_platform_id clPlatform; - cl_context clContext; - cl_command_queue clCommandQueue; - std::map cache; - bool initialized = false; - bool waitFinish = false; + cl_device_id clDevice; + cl_platform_id clPlatform; + cl_context clContext; + cl_command_queue clCommandQueue; + CacheType cache; + bool initialized = false; + bool waitFinish = false; }; #endif // USE_OPENCL @@ -127,14 +117,10 @@ class OpenCLDevice : public Device class CUDADevice : public Device { public: + using CacheType = std::unordered_map; + explicit CUDADevice(int deviceIndex); ~CUDADevice() override; - CUDADevice(const CUDADevice &) = default; - CUDADevice(CUDADevice &&) = default; - auto - operator=(const CUDADevice &) -> CUDADevice & = default; - auto - operator=(CUDADevice &&) -> CUDADevice & = default; auto initialize() -> void override; @@ -143,7 +129,7 @@ class CUDADevice : public Device auto finish() const -> void override; auto - setWaitToFinish(bool) -> void override; + setWaitToFinish(bool flag) -> void override; [[nodiscard]] auto getType() const -> Device::Type override; @@ -164,16 +150,16 @@ class CUDADevice : public Device [[nodiscard]] auto getArch() const -> std::string; [[nodiscard]] auto - getCache() -> std::map &; + getCache() -> CacheType &; private: - int cudaDeviceIndex; - CUdevice cudaDevice; - CUcontext cudaContext; - CUstream cudaStream; - bool initialized = false; - bool waitFinish = false; - std::map cache; + int cudaDeviceIndex; + CUdevice cudaDevice; + CUcontext cudaContext; + CUstream cudaStream; + bool initialized = false; + bool waitFinish = false; + CacheType cache; }; #endif // USE_CUDA diff --git a/clic/include/execution.hpp b/clic/include/execution.hpp index 87b1fd908..8fa747bb7 100644 --- a/clic/include/execution.hpp +++ b/clic/include/execution.hpp @@ -18,15 +18,6 @@ using ConstantList = std::vector>; using KernelInfo = std::pair; using RangeArray = std::array; -auto -translateOpenclToCuda(std::string & code) -> void; - -auto -cudaDefines(const ParameterList & parameter_list, const ConstantList & constant_list) -> std::string; - -auto -oclDefines(const ParameterList & parameter_list, const ConstantList & constant_list) -> std::string; - auto execute(const Device::Pointer & device, const KernelInfo & kernel_func, @@ -41,9 +32,6 @@ native_execute(const Device::Pointer & device, const RangeArray & global_range = { 1, 1, 1 }, const RangeArray & local_range = { 1, 1, 1 }) -> void; -auto -loadSource(const std::string & source_path) -> std::string; - } // namespace cle #endif // __INCLUDE_EXECUTION_HPP diff --git a/clic/include/utils.hpp b/clic/include/utils.hpp index 4c6fe0029..2f13ba069 100644 --- a/clic/include/utils.hpp +++ b/clic/include/utils.hpp @@ -1,14 +1,9 @@ #ifndef __INCLUDE_UTILS_HPP #define __INCLUDE_UTILS_HPP -#include +#include #include -#include -#include #include -#include - -#include #ifndef M_PI # define M_PI 3.14159265358979323846 /* pi */ #endif @@ -241,24 +236,42 @@ correct_range(int * start, int * stop, int * step, int size) -> void { // # set in case not set (passed None) if (step == nullptr) + { *step = 1; + } if (start == nullptr) + { *start = (*step >= 0) ? 0 : size - 1; + } if (stop == nullptr) + { *stop = (*step >= 0) ? size : -1; + } // # Check if ranges make sense if (*start >= size) + { *start = (*step >= 0) ? size : size - 1; + } if (*start < -size + 1) + { *start = -size + 1; + } if (*stop > size) + { *stop = size; + } if (*stop < -size) + { *stop = (*start > 0) ? 0 - 1 : -size; + } if (*start < 0) + { *start = size - *start; + } if ((*start > *stop && *step > 0) || (*start < *stop && *step < 0)) + { *stop = *start; + } } diff --git a/clic/src/array.cpp b/clic/src/array.cpp index 5301980ad..51554a1ac 100644 --- a/clic/src/array.cpp +++ b/clic/src/array.cpp @@ -1,28 +1,24 @@ #include "array.hpp" + #include namespace cle { -Array::Array(const size_t & width, - const size_t & height, - const size_t & depth, +Array::Array(const size_t width, + const size_t height, + const size_t depth, const dType & data_type, const mType & mem_type, const Device::Pointer & device_ptr) - : width_(width) - , height_(height) - , depth_(depth) + : width_((width > 1) ? width : 1) + , height_((height > 1) ? height : 1) + , depth_((depth > 1) ? depth : 1) , dataType_(data_type) , memType_(mem_type) , device_(device_ptr) , data_(std::make_shared(nullptr)) - , initialized_(false) -{ - width_ = (width_ > 1) ? width_ : 1; - height_ = (height_ > 1) ? height_ : 1; - depth_ = (depth_ > 1) ? depth_ : 1; -} +{} Array::~Array() { @@ -33,9 +29,9 @@ Array::~Array() } auto -Array::create(const size_t & width, - const size_t & height, - const size_t & depth, +Array::create(const size_t width, + const size_t height, + const size_t depth, const dType & data_type, const mType & mem_type, const Device::Pointer & device_ptr) -> Array::Pointer @@ -46,9 +42,9 @@ Array::create(const size_t & width, } auto -Array::create(const size_t & width, - const size_t & height, - const size_t & depth, +Array::create(const size_t width, + const size_t height, + const size_t depth, const dType & data_type, const mType & mem_type, const void * host_data, @@ -60,7 +56,7 @@ Array::create(const size_t & width, } auto -Array::create(Array::Pointer array) -> Array::Pointer +Array::create(const Array::Pointer & array) -> Array::Pointer { auto ptr = create(array->width(), array->height(), array->depth(), array->dtype(), array->mtype(), array->device()); array->copy(ptr); @@ -80,7 +76,6 @@ Array::allocate() -> void { if (initialized()) { - std::cerr << "Warning: Array is already initialized" << std::endl; return; } backend_.allocateMemory(device(), { this->width(), this->height(), this->depth() }, dtype(), mtype(), get()); @@ -92,7 +87,7 @@ Array::write(const void * host_data) -> void { if (host_data == nullptr) { - throw std::runtime_error("Error: host_data is null"); + throw std::runtime_error("Error: Host data is null"); } if (!initialized()) { @@ -110,7 +105,7 @@ Array::write(const void * host_data, const std::array & region, const { if (host_data == nullptr) { - throw std::runtime_error("Error: host_data is null"); + throw std::runtime_error("Error: Host data is null"); } if (!initialized()) { @@ -123,7 +118,7 @@ Array::write(const void * host_data, const std::array & region, const } auto -Array::write(const void * host_data, const size_t & x_coord, const size_t & y_coord, const size_t & z_coord) -> void +Array::write(const void * host_data, const size_t x_coord, const size_t y_coord, const size_t z_coord) -> void { write(host_data, { 1, 1, 1 }, { x_coord, y_coord, z_coord }); } @@ -133,7 +128,7 @@ Array::read(void * host_data) const -> void { if (host_data == nullptr) { - throw std::runtime_error("Error: host_data is null"); + throw std::runtime_error("Error: Host data is null"); } if (!initialized()) { @@ -151,7 +146,7 @@ Array::read(void * host_data, const std::array & region, const std::a { if (host_data == nullptr) { - throw std::runtime_error("Error: host_data is null"); + throw std::runtime_error("Error: Host data is null"); } if (!initialized()) { @@ -164,7 +159,7 @@ Array::read(void * host_data, const std::array & region, const std::a } auto -Array::read(void * host_data, const size_t & x_coord, const size_t & y_coord, const size_t & z_coord) const -> void +Array::read(void * host_data, const size_t x_coord, const size_t y_coord, const size_t z_coord) const -> void { read(host_data, { 1, 1, 1 }, { x_coord, y_coord, z_coord }); } @@ -174,48 +169,99 @@ Array::copy(const Array::Pointer & dst) const -> void { if (!initialized() || !dst->initialized()) { - std::cerr << "Error: Arrays are not initialized_" << std::endl; + throw std::runtime_error("Error: Arrays are not initialized_"); } if (device() != dst->device()) { - std::cerr << "Error: copying Arrays from different devices" << std::endl; + throw std::runtime_error("Error: Copying Arrays from different devices"); } if (width() != dst->width() || height() != dst->height() || depth() != dst->depth() || itemSize() != dst->itemSize()) { - std::cerr << "Error: Arrays dimensions do not match" << std::endl; + throw std::runtime_error("Error: Arrays dimensions do not match"); } + std::array _src_origin = { 0, 0, 0 }; + std::array _dst_origin = { 0, 0, 0 }; + std::array _region = { this->width(), this->height(), this->depth() }; + std::array _src_shape = { this->width(), this->height(), this->depth() }; + std::array _dst_shape = { dst->width(), dst->height(), dst->depth() }; if (mtype() == mType::BUFFER && dst->mtype() == mType::BUFFER) { backend_.copyMemoryBufferToBuffer( - device(), c_get(), { width(), height(), depth() }, { 0, 0, 0 }, toBytes(dtype()), dst->get()); + device(), c_get(), _src_origin, _src_shape, dst->get(), _dst_origin, _dst_shape, _region, toBytes(dtype())); } else if (mtype() == mType::IMAGE && dst->mtype() == mType::IMAGE) { backend_.copyMemoryImageToImage( - device(), c_get(), { width(), height(), depth() }, { 0, 0, 0 }, toBytes(dtype()), dst->get()); + device(), c_get(), _src_origin, _src_shape, dst->get(), _dst_origin, _dst_shape, _region, toBytes(dtype())); } else if (mtype() == mType::BUFFER && dst->mtype() == mType::IMAGE) { backend_.copyMemoryBufferToImage( - device(), c_get(), { width(), height(), depth() }, { 0, 0, 0 }, toBytes(dtype()), dst->get()); + device(), c_get(), _src_origin, _src_shape, dst->get(), _dst_origin, _dst_shape, _region, toBytes(dtype())); } else if (mtype() == mType::IMAGE && dst->mtype() == mType::BUFFER) { backend_.copyMemoryImageToBuffer( - device(), c_get(), { width(), height(), depth() }, { 0, 0, 0 }, toBytes(dtype()), dst->get()); + device(), c_get(), _src_origin, _src_shape, dst->get(), _dst_origin, _dst_shape, _region, toBytes(dtype())); } else { - std::cerr << "Error: copying Arrays from different memory types" << std::endl; + throw std::runtime_error("Error: Copying Arrays from different memory types"); + } +} + +auto +Array::copy(const Array::Pointer & dst, + const std::array & region, + const std::array & src_origin, + const std::array & dst_origin) const -> void +{ + if (!initialized() || !dst->initialized()) + { + throw std::runtime_error("Error: Arrays are not initialized_"); + } + if (device() != dst->device()) + { + throw std::runtime_error("Error: Copying Arrays from different devices"); + } + if (width() != dst->width() || height() != dst->height() || depth() != dst->depth() || itemSize() != dst->itemSize()) + { + throw std::runtime_error("Error: Arrays dimensions do not match"); + } + std::array _src_origin = src_origin; + std::array _dst_origin = dst_origin; + std::array _region = region; + std::array _src_shape = { this->width(), this->height(), this->depth() }; + std::array _dst_shape = { dst->width(), dst->height(), dst->depth() }; + + if (mtype() == mType::BUFFER && dst->mtype() == mType::BUFFER) + { + backend_.copyMemoryBufferToBuffer( + device(), c_get(), _src_origin, _src_shape, dst->get(), _dst_origin, _dst_shape, _region, toBytes(dtype())); + } + else if (mtype() == mType::IMAGE && dst->mtype() == mType::IMAGE) + { + backend_.copyMemoryImageToImage( + device(), c_get(), _src_origin, _src_shape, dst->get(), _dst_origin, _dst_shape, _region, toBytes(dtype())); + } + else if (mtype() == mType::BUFFER && dst->mtype() == mType::IMAGE) + { + backend_.copyMemoryBufferToImage( + device(), c_get(), _src_origin, _src_shape, dst->get(), _dst_origin, _dst_shape, _region, toBytes(dtype())); + } + else if (mtype() == mType::IMAGE && dst->mtype() == mType::BUFFER) + { + backend_.copyMemoryImageToBuffer( + device(), c_get(), _src_origin, _src_shape, dst->get(), _dst_origin, _dst_shape, _region, toBytes(dtype())); } } auto -Array::fill(const float & value) const -> void +Array::fill(const float value) const -> void { if (!initialized()) { - std::cerr << "Error: Arrays are not initialized_" << std::endl; + throw std::runtime_error("Error: Array it is not initialized."); } std::array _origin = { 0, 0, 0 }; std::array _region = { this->width(), this->height(), this->depth() }; diff --git a/clic/src/cudabackend.cpp b/clic/src/cudabackend.cpp index d1e1a3316..5afb65680 100644 --- a/clic/src/cudabackend.cpp +++ b/clic/src/cudabackend.cpp @@ -1,7 +1,5 @@ #include "backend.hpp" #include "cle_preamble_cu.h" -#include -#include namespace cle { @@ -13,6 +11,22 @@ CUDABackend::CUDABackend() #endif } +#if USE_CUDA +[[nodiscard]] static auto +getErrorString(const CUresult & error) -> std::string +{ + const char * error_string; + cuGetErrorString(error, &error_string); + return std::string(error_string); +} + +[[nodiscard]] static auto +getErrorString(const nvrtcResult & error) -> std::string +{ + return std::string(nvrtcGetErrorString(error)); +} +#endif + auto CUDABackend::getDevices(const std::string & type) const -> std::vector { @@ -21,7 +35,8 @@ CUDABackend::getDevices(const std::string & type) const -> std::vector devices; for (int i = 0; i < deviceCount; i++) @@ -48,7 +63,7 @@ CUDABackend::getDevice(const std::string & name, const std::string & type) const } if (!devices.empty()) { - std::cerr << "WARNING: Device with name '" << name << "' not found. Using default device." << std::endl; + std::cerr << "WARNING: Device with name '" << name << "' not found. Using default device instead." << std::endl; return std::move(devices.back()); } return nullptr; @@ -95,7 +110,7 @@ CUDABackend::allocateMemory(const Device::Pointer & device, break; } case mType::IMAGE: { - // TODO @StRigaud: implement image support for CUDA + // @StRigaud TODO: implement image support for CUDA // allocateImage(device, region, dtype, data_ptr); const size_t size = region[0] * region[1] * region[2] * toBytes(dtype); allocateBuffer(device, size, data_ptr); @@ -115,13 +130,15 @@ CUDABackend::allocateBuffer(const Device::Pointer & device, const size_t & size, auto err = cuCtxSetCurrent(cuda_device->getCUDAContext()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to get context from device (" + std::to_string(err) + ")."); + throw std::runtime_error("Error: Fail to get context from device.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } CUdeviceptr mem; err = cuMemAlloc(&mem, size); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to allocate memory (buffer) with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to allocate buffer memory.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } *data_ptr = reinterpret_cast(mem); #else @@ -130,7 +147,7 @@ CUDABackend::allocateBuffer(const Device::Pointer & device, const size_t & size, } /* -// TODO @StRigaud: implement image support for CUDA +// @StRigaud TODO: implement image support for CUDA auto CUDABackend::allocateImage(const Device::Pointer & device, const std::array & region, @@ -142,7 +159,7 @@ CUDABackend::allocateImage(const Device::Pointer & device, auto err = cuCtxSetCurrent(cuda_device->getCUDAContext()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to get context from device (" + std::to_string(err) + ")."); + throw std::runtime_error("Error: Fail to get context from device (" + std::to_string(err) + ")."); } CUarray array; CUarray_format format; @@ -196,7 +213,7 @@ CUDABackend::allocateImage(const Device::Pointer & device, } if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to allocate memory (image) with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to allocate memory (image) with error code " + std::to_string(err)); } *data_ptr = reinterpret_cast(array); #else @@ -213,21 +230,15 @@ CUDABackend::freeMemory(const Device::Pointer & device, const mType & mtype, voi auto err = cuCtxSetCurrent(cuda_device->getCUDAContext()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to get context from device (" + std::to_string(err) + ")."); - } - if (mtype == mType::IMAGE) - { - // TODO @StRigaud: implement image support for CUDA - // err = cuArrayDestroy(reinterpret_cast(*data_ptr)); - err = cuMemFree(reinterpret_cast(*data_ptr)); - } - else - { - err = cuMemFree(reinterpret_cast(*data_ptr)); + throw std::runtime_error("Error: Fail to get context from device.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } + // @StRigaud TODO: implement image support for CUDA + err = cuMemFree(reinterpret_cast(*data_ptr)); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to free memory with error code " + std::to_string(err) + "."); + throw std::runtime_error("Error: Fail to free memory.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: CUDA is not enabled"); @@ -255,7 +266,7 @@ CUDABackend::writeMemory(const Device::Pointer & device, break; } case mType::IMAGE: { - // TODO @StRigaud: implement image support for CUDA + // @StRigaud TODO: implement image support for CUDA writeBuffer(device, buffer_ptr, buffer_shape, buffer_origin, region, host_ptr); break; } @@ -278,7 +289,8 @@ CUDABackend::writeBuffer(const Device::Pointer & device, auto err = cuCtxSetCurrent(cuda_device->getCUDAContext()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to get context from device (" + std::to_string(err) + ")."); + throw std::runtime_error("Error: Fail to get context from device.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } size_t buffer_row_pitch = buffer_shape[1] > 1 ? buffer_shape[0] : 0; @@ -331,8 +343,8 @@ CUDABackend::writeBuffer(const Device::Pointer & device, } if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to write on device (host -> buffer) with error code " + - std::to_string(err)); + throw std::runtime_error("Error: Fail to write on device from host to buffer.\nCUDA error : " + + getErrorString(err) + " (" + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: CUDA is not enabled"); @@ -352,7 +364,8 @@ CUDABackend::readBuffer(const Device::Pointer & device, auto err = cuCtxSetCurrent(cuda_device->getCUDAContext()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to get context from device (" + std::to_string(err) + ")."); + throw std::runtime_error("Error: Fail to get context from device.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } size_t buffer_row_pitch = buffer_shape[1] > 1 ? buffer_shape[0] : 0; size_t buffer_slice_pitch = buffer_shape[2] > 1 ? buffer_shape[1] : 0; @@ -405,8 +418,8 @@ CUDABackend::readBuffer(const Device::Pointer & device, } if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to read memory (buffer -> host) with error code " + - std::to_string(err)); + throw std::runtime_error("Error: Fail to read memory from buffer to host.\nCUDA error : " + getErrorString(err) + + " (" + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: CUDA is not enabled"); @@ -434,7 +447,7 @@ CUDABackend::readMemory(const Device::Pointer & device, break; } case mType::IMAGE: { - // TODO @StRigaud: implement image support for CUDA + // @StRigaud TODO: implement image support for CUDA readBuffer(device, buffer_ptr, buffer_shape, buffer_origin, region, host_ptr); break; } @@ -445,69 +458,87 @@ CUDABackend::readMemory(const Device::Pointer & device, } auto -CUDABackend::copyMemoryBufferToBuffer(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void +CUDABackend::copyMemoryBufferToBuffer(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void { #if USE_CUDA auto cuda_device = std::dynamic_pointer_cast(device); auto err = cuCtxSetCurrent(cuda_device->getCUDAContext()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to get context from device (" + std::to_string(err) + ")."); + throw std::runtime_error("Error: Fail to get context from device.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } - if (region[2] > 1) + + region[0] *= bytes; + src_origin[0] *= bytes; + src_shape[0] *= bytes; + dst_origin[0] *= bytes; + dst_shape[0] *= bytes; + + size_t src_row_pitch = src_shape[1] > 1 ? src_shape[0] : 0; + size_t src_slice_pitch = src_shape[2] > 1 ? src_shape[0] * src_shape[1] : 0; + size_t dst_row_pitch = dst_shape[1] > 1 ? dst_shape[0] : 0; + size_t dst_slice_pitch = dst_shape[2] > 1 ? dst_shape[0] * dst_shape[1] : 0; + + if (src_shape[2] > 1) { CUDA_MEMCPY3D copyParams = { 0 }; - copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE; // Source memory type. - copyParams.dstDevice = reinterpret_cast(*dst_data_ptr); - copyParams.dstXInBytes = origin[0] * bytes; - copyParams.dstY = origin[1]; - copyParams.dstZ = origin[2]; + copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE; + copyParams.dstDevice = reinterpret_cast(*dst_ptr); + copyParams.dstXInBytes = dst_origin[0]; + copyParams.dstY = dst_origin[1]; + copyParams.dstPitch = dst_row_pitch; - copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE; // Destination memory type. - copyParams.srcDevice = reinterpret_cast(*src_data_ptr); - copyParams.srcXInBytes = origin[0] * bytes; - copyParams.srcY = origin[1]; - copyParams.srcZ = origin[2]; + copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE; + copyParams.srcDevice = reinterpret_cast(*src_ptr); + copyParams.srcXInBytes = src_origin[0]; + copyParams.srcY = src_origin[1]; + copyParams.srcPitch = src_row_pitch; - copyParams.WidthInBytes = region[0] * bytes; + copyParams.WidthInBytes = region[0]; copyParams.Height = region[1]; copyParams.Depth = region[2]; err = cuMemcpy3D(©Params); } - else if (region[1] > 1) + else if (src_shape[1] > 1) { CUDA_MEMCPY2D copyParams = { 0 }; - copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE; // Source memory type. - copyParams.dstDevice = reinterpret_cast(*dst_data_ptr); - copyParams.dstXInBytes = origin[0] * bytes; - copyParams.dstY = origin[1]; + copyParams.dstMemoryType = CU_MEMORYTYPE_DEVICE; + copyParams.dstDevice = reinterpret_cast(*dst_ptr); + copyParams.dstXInBytes = dst_origin[0]; + copyParams.dstY = dst_origin[1]; + copyParams.dstPitch = dst_row_pitch; - copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE; // Destination memory type. - copyParams.srcDevice = reinterpret_cast(*src_data_ptr); - copyParams.srcXInBytes = origin[0] * bytes; - copyParams.srcY = origin[1]; + copyParams.srcMemoryType = CU_MEMORYTYPE_DEVICE; + copyParams.srcDevice = reinterpret_cast(*src_ptr); + copyParams.srcXInBytes = src_origin[0]; + copyParams.srcY = src_origin[1]; + copyParams.srcPitch = src_row_pitch; - copyParams.WidthInBytes = region[0] * bytes; + copyParams.WidthInBytes = region[0]; copyParams.Height = region[1]; err = cuMemcpy2D(©Params); } else { - auto dst_ptr = reinterpret_cast(*dst_data_ptr) + (origin[0] * bytes); - CUdeviceptr src_ptr = reinterpret_cast(*src_data_ptr) + (origin[0] * bytes); - err = cuMemcpy((CUdeviceptr)dst_ptr, src_ptr, region[0] * bytes); + auto new_dst_ptr = reinterpret_cast(reinterpret_cast(*dst_ptr) + dst_origin[0]); + auto new_src_ptr = reinterpret_cast(reinterpret_cast(*src_ptr) + src_origin[0]); + err = cuMemcpy(new_dst_ptr, new_src_ptr, region[0]); } if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to copy device memory (buffer -> buffer) with error code " + - std::to_string(err)); + throw std::runtime_error("Error: Fail to copy memory from buffer to buffer.\nCUDA error : " + getErrorString(err) + + " (" + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: CUDA is not enabled"); @@ -515,45 +546,54 @@ CUDABackend::copyMemoryBufferToBuffer(const Device::Pointer & device, } auto -CUDABackend::copyMemoryImageToBuffer(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void +CUDABackend::copyMemoryImageToBuffer(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void { #if USE_CUDA - copyMemoryBufferToBuffer(device, src_data_ptr, region, origin, bytes, dst_data_ptr); + copyMemoryBufferToBuffer(device, src_ptr, src_origin, src_shape, dst_ptr, dst_origin, dst_shape, region, bytes); #else throw std::runtime_error("Error: CUDA is not enabled"); #endif } auto -CUDABackend::copyMemoryBufferToImage(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void +CUDABackend::copyMemoryBufferToImage(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void { #if USE_CUDA - copyMemoryBufferToBuffer(device, src_data_ptr, region, origin, bytes, dst_data_ptr); + copyMemoryBufferToBuffer(device, src_ptr, src_origin, src_shape, dst_ptr, dst_origin, dst_shape, region, bytes); #else throw std::runtime_error("Error: CUDA is not enabled"); #endif } auto -CUDABackend::copyMemoryImageToImage(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void +CUDABackend::copyMemoryImageToImage(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void { #if USE_CUDA - copyMemoryBufferToBuffer(device, src_data_ptr, region, origin, bytes, dst_data_ptr); + copyMemoryBufferToBuffer(device, src_ptr, src_origin, src_shape, dst_ptr, dst_origin, dst_shape, region, bytes); #else throw std::runtime_error("Error: CUDA is not enabled"); #endif @@ -573,7 +613,8 @@ CUDABackend::setBuffer(const Device::Pointer & device, auto err = cuCtxSetCurrent(cuda_device->getCUDAContext()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to get context from device (" + std::to_string(err) + ")."); + throw std::runtime_error("Error: Fail to get context from device.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } const auto count = region[0] * region[1] * region[2]; const auto dev_ptr = reinterpret_cast(*buffer_ptr); @@ -630,7 +671,8 @@ CUDABackend::setBuffer(const Device::Pointer & device, } if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to set memory with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to fill memory.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: CUDA is not enabled"); @@ -708,12 +750,10 @@ CUDABackend::buildKernel(const Device::Pointer & device, auto err = cuCtxSetCurrent(cuda_device->getCUDAContext()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to set CUDA device before memory allocation."); + throw std::runtime_error("Error: Fail to get context from device.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } - std::chrono::high_resolution_clock::time_point start_time, end_time; - std::chrono::microseconds duration; - CUmodule cuModule = nullptr; std::string hash = std::to_string(std::hash{}(kernel_source)); loadProgramFromCache(device, hash, &cuModule); @@ -723,8 +763,8 @@ CUDABackend::buildKernel(const Device::Pointer & device, auto res = nvrtcCreateProgram(&prog, kernel_source.c_str(), nullptr, 0, nullptr, nullptr); if (res != NVRTC_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to create program from source with error code " + - std::to_string(res)); + throw std::runtime_error("Error: Fail to create kernel program from source.\nCUDA error : " + + getErrorString(res) + " (" + std::to_string(res) + ")."); } const std::string arch_comp = "--gpu-architecture=compute_" + cuda_device->getArch(); @@ -738,7 +778,8 @@ CUDABackend::buildKernel(const Device::Pointer & device, std::string log(log_size, '\0'); nvrtcGetProgramLog(prog, &log[0]); std::cerr << "Build log: " << log << std::endl; - throw std::runtime_error("Error (cuda): Failed to build program with error code " + std::to_string(res)); + throw std::runtime_error("Error: Fail to build kernel program.\nCUDA error : " + getErrorString(res) + " (" + + std::to_string(res) + ")."); } size_t ptxSize; nvrtcGetPTXSize(prog, &ptxSize); @@ -747,13 +788,15 @@ CUDABackend::buildKernel(const Device::Pointer & device, res = nvrtcDestroyProgram(&prog); if (res != NVRTC_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to destroy program with error code " + std::to_string(res)); + throw std::runtime_error("Error: Fail to destroy kernel program.\nCUDA error : " + getErrorString(res) + " (" + + std::to_string(res) + ")."); } err = cuModuleLoadData(&cuModule, ptx.data()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Loading module with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to load module.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } @@ -763,7 +806,8 @@ CUDABackend::buildKernel(const Device::Pointer & device, err = cuModuleGetFunction(&cuFunction, cuModule, kernel_name.c_str()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Getting function from module with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to build function from module.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } *(reinterpret_cast(kernel)) = cuFunction; #else @@ -784,18 +828,11 @@ CUDABackend::executeKernel(const Device::Pointer & device, auto err = cuCtxSetCurrent(cuda_device->getCUDAContext()); if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed to set CUDA device before memory allocation."); + throw std::runtime_error("Error: Fail to get context from device."); } CUfunction cuFunction; - try - { - buildKernel(device, kernel_source, kernel_name, &cuFunction); - } - catch (const std::exception & e) - { - throw std::runtime_error("Error (cuda): Failed to build kernel. \n\t > " + std::string(e.what())); - } + buildKernel(device, kernel_source, kernel_name, &cuFunction); std::vector argsValues(args.size()); argsValues = args; @@ -851,7 +888,8 @@ CUDABackend::executeKernel(const Device::Pointer & device, if (err != CUDA_SUCCESS) { - throw std::runtime_error("Error (cuda): Failed launching kernel with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail launching kernel.\nCUDA error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } cuda_device->finish(); #else diff --git a/clic/src/cudadevice.cpp b/clic/src/cudadevice.cpp index 4b5648a4e..064843308 100644 --- a/clic/src/cudadevice.cpp +++ b/clic/src/cudadevice.cpp @@ -30,7 +30,6 @@ CUDADevice::initialize() -> void { if (isInitialized()) { - // std::cerr << "CUDA device already initialized" << std::endl; return; } auto err = cuDeviceGet(&cudaDevice, 0); @@ -161,7 +160,7 @@ CUDADevice::getInfo() const -> std::string } auto -CUDADevice::getCache() -> std::map & +CUDADevice::getCache() -> CUDADevice::CacheType & { return this->cache; } diff --git a/clic/src/execution.cpp b/clic/src/execution.cpp index 24dc90216..27efab162 100644 --- a/clic/src/execution.cpp +++ b/clic/src/execution.cpp @@ -1,15 +1,13 @@ #include "execution.hpp" #include "backend.hpp" - -#include -#include -#include -#include +#include "clic.hpp" namespace cle { -auto +// Function for translating OpenCL code to CUDA code +// @StRigaud TODO: function is not exhaustive and needs to be improved to support more features +static auto translateOpenclToCuda(std::string & code) -> void { // nested lambda function to find and replace all occurrences of a string @@ -29,11 +27,12 @@ translateOpenclToCuda(std::string & code) -> void }; // list of replacements to be performed (not exhaustive) + // special case: 'make_' need to followed by ');' replacement, e.g. (int2){1,2}; -> make_int2(1,2); const std::vector> replacements = { - { "(int2){", "make_int2(" }, // special case - need to followed by ');' replacement - { "(int4){", "make_int4(" }, // special case - need to followed by ');' replacement - { "(float4){", "make_float4(" }, // special case - need to followed by ');' replacement - { "(float2){", "make_float2(" }, // special case - need to followed by ');' replacement + { "(int2){", "make_int2(" }, + { "(int4){", "make_int4(" }, + { "(float4){", "make_float4(" }, + { "(float2){", "make_float2(" }, { "__constant sampler_t", "__device__ int" }, { "inline", "__device__ inline" }, { "#pragma", "// #pragma" }, @@ -50,8 +49,10 @@ translateOpenclToCuda(std::string & code) -> void } } -auto -cudaDefines(const ParameterList & parameter_list, const ConstantList & constant_list) -> std::string + +// Function creating common defines for constants +static auto +commonDefines(const ConstantList & constant_list) -> std::string { std::ostringstream defines; for (const auto & [key, value] : constant_list) @@ -59,65 +60,58 @@ cudaDefines(const ParameterList & parameter_list, const ConstantList & constant_ defines << "#define " << key << " " << value << "\n"; } defines << "\n"; - for (const auto & param : parameter_list) - { - if (std::holds_alternative(param.second) || std::holds_alternative(param.second)) - { - continue; - } - const auto & arr = std::get(param.second); + defines << "\n#define GET_IMAGE_WIDTH(image_key) IMAGE_SIZE_ ## image_key ## _WIDTH"; + defines << "\n#define GET_IMAGE_HEIGHT(image_key) IMAGE_SIZE_ ## image_key ## _HEIGHT"; + defines << "\n#define GET_IMAGE_DEPTH(image_key) IMAGE_SIZE_ ## image_key ## _DEPTH"; + defines << "\n"; + return defines.str(); +} - // Function to format and append the define string - static constexpr std::array ndimMap = { "1", "2", "3" }; - static constexpr std::array posTypeMap = { "int", "int2", "int4" }; - static constexpr std::array posMap = { "(pos0)", "(pos0, pos1)", "(pos0, pos1, pos2, 0)" }; - int dim = arr->dim(); - std::string ndim = ndimMap[dim - 1]; - std::string pos_type = posTypeMap[dim - 1]; - std::string pos = posMap[dim - 1]; - if (pos_type == "int") - { - defines << "\n#define POS_" << param.first << "_INSTANCE(pos0,pos1,pos2,pos3) " << pos; - } - else - { - defines << "\n#define POS_" << param.first << "_INSTANCE(pos0,pos1,pos2,pos3) make_" << pos_type << "" << pos; - } +// Function creating buffer specific defines +static auto +bufferDefines(std::ostringstream & defines, + const std::string & key, + const std::string & ndim, + const std::string & dtype, + const std::string & stype, + const std::string & ocl) -> void +{ + defines << "\n#define IMAGE_" << key << "_TYPE " << ocl << dtype << "*"; + defines << "\n#define READ_" << key << "_IMAGE(a,b,c) read_buffer" << ndim << "d" << stype + << "(GET_IMAGE_WIDTH(a),GET_IMAGE_HEIGHT(a),GET_IMAGE_DEPTH(a),a,b,c)"; + defines << "\n#define WRITE_" << key << "_IMAGE(a,b,c) write_buffer" << ndim << "d" << stype + << "(GET_IMAGE_WIDTH(a),GET_IMAGE_HEIGHT(a),GET_IMAGE_DEPTH(a),a,b,c)"; +} - defines << "\n"; - defines << "\n#define CONVERT_" << param.first << "_PIXEL_TYPE clij_convert_" << arr->dtype() << "_sat"; - defines << "\n#define IMAGE_" << param.first << "_PIXEL_TYPE " << arr->dtype() << ""; - defines << "\n#define POS_" << param.first << "_TYPE " << pos_type; - defines << "\n\n"; - defines << "\n#define IMAGE_SIZE_" << param.first << "_WIDTH " << std::to_string(arr->width()); - defines << "\n#define IMAGE_SIZE_" << param.first << "_HEIGHT " << std::to_string(arr->height()); - defines << "\n#define IMAGE_SIZE_" << param.first << "_DEPTH " << std::to_string(arr->depth()); - defines << "\n\n"; - defines << "\n#define IMAGE_" << param.first << "_TYPE " << arr->dtype() << "*"; - defines << "\n#define READ_" << param.first << "_IMAGE(a,b,c) read_buffer" << ndim << "d" << arr->shortType() - << "(GET_IMAGE_WIDTH(a),GET_IMAGE_HEIGHT(a),GET_IMAGE_DEPTH(a),a,b,c)"; - defines << "\n#define WRITE_" << param.first << "_IMAGE(a,b,c) write_buffer" << ndim << "d" << arr->shortType() - << "(GET_IMAGE_WIDTH(a),GET_IMAGE_HEIGHT(a),GET_IMAGE_DEPTH(a),a,b,c)"; - defines << "\n"; - } - defines << "\n"; - return defines.str(); + +// Function creating image specific defines (OpenCL only for now) +static auto +imageDefines(std::ostringstream & defines, + const std::string & key, + const std::string & ndim, + const std::string & stype, + const std::string & access_type) -> void +{ + char front_char = stype.front(); + std::string prefix = (front_char == 'u') ? "ui" : (front_char == 'f') ? "f" : "i"; + std::string img_type_name = access_type + " image" + ndim + "d_t"; + defines << "\n#define IMAGE_" << key << "_TYPE " << img_type_name; + defines << "\n#define READ_" << key << "_IMAGE(a,b,c) read_image" << prefix << "(a,b,c)"; + defines << "\n#define WRITE_" << key << "_IMAGE(a,b,c) write_image" << prefix << "(a,b,c)"; } -auto -oclDefines(const ParameterList & parameter_list, const ConstantList & constant_list) -> std::string + +// Function for creating defines for each array parameters +static auto +arrayDefines(const ParameterList & parameter_list, const Device::Type & device) -> std::string { - std::ostringstream defines; - for (const auto & [key, value] : constant_list) - { - defines << "#define " << key << " " << value << "\n"; - } - defines << "\n"; - defines << "\n#define GET_IMAGE_WIDTH(image_key) IMAGE_SIZE_ ## image_key ## _WIDTH"; - defines << "\n#define GET_IMAGE_HEIGHT(image_key) IMAGE_SIZE_ ## image_key ## _HEIGHT"; - defines << "\n#define GET_IMAGE_DEPTH(image_key) IMAGE_SIZE_ ## image_key ## _DEPTH"; - defines << "\n"; + std::ostringstream defines; + static constexpr std::array ndimMap = { "1", "2", "3" }; + static constexpr std::array posTypeMap = { "int", "int2", "int4" }; + static constexpr std::array posMap = { "(pos0)", "(pos0, pos1)", "(pos0, pos1, pos2, 0)" }; + + // loop over all parameters, skip if parameter is not an array for (const auto & param : parameter_list) { if (std::holds_alternative(param.second) || std::holds_alternative(param.second)) @@ -125,35 +119,29 @@ oclDefines(const ParameterList & parameter_list, const ConstantList & constant_l continue; } const auto & arr = std::get(param.second); + const auto & key = param.first; - static constexpr std::array ndimMap = { "1", "2", "3" }; - static constexpr std::array posTypeMap = { "int", "int2", "int4" }; - static constexpr std::array posMap = { "(pos0)", "(pos0, pos1)", "(pos0, pos1, pos2, 0)" }; - - const int dimIndex = arr->dim() - 1; + // manage array dimension + const size_t dimIndex = arr->dim() - 1; const std::string ndim = ndimMap[dimIndex]; const std::string pos_type = posTypeMap[dimIndex]; const std::string pos = posMap[dimIndex]; - + defines << "\n#define CONVERT_" << key << "_PIXEL_TYPE clij_convert_" << arr->dtype() << "_sat"; + defines << "\n#define IMAGE_" << key << "_PIXEL_TYPE " << arr->dtype(); + defines << "\n#define POS_" << key << "_TYPE " << pos_type; + const std::string prefix = + (device == Device::Type::OPENCL || pos_type == "int") ? "(" + pos_type + ")" : "make_" + pos_type; + defines << "\n#define POS_" << param.first << "_INSTANCE(pos0,pos1,pos2,pos3) " << prefix << pos; defines << "\n"; - defines << "\n#define CONVERT_" << param.first << "_PIXEL_TYPE clij_convert_" << arr->dtype() << "_sat"; - defines << "\n#define IMAGE_" << param.first << "_PIXEL_TYPE " << arr->dtype() << ""; - defines << "\n#define POS_" << param.first << "_TYPE " << pos_type; - defines << "\n#define POS_" << param.first << "_INSTANCE(pos0,pos1,pos2,pos3) (" << pos_type << ")" << pos; - defines << "\n"; - if (arr->mtype() == mType::BUFFER) + + // manage array type (buffer or image), and read/write macros + if (arr->mtype() == mType::BUFFER || device == Device::Type::CUDA) { - defines << "\n#define IMAGE_" << param.first << "_TYPE __global " << arr->dtype() << "*"; - defines << "\n#define READ_" << param.first << "_IMAGE(a,b,c) read_buffer" << ndim << "d" << arr->shortType() - << "(GET_IMAGE_WIDTH(a),GET_IMAGE_HEIGHT(a),GET_IMAGE_DEPTH(a),a,b,c)"; - defines << "\n#define WRITE_" << param.first << "_IMAGE(a,b,c) write_buffer" << ndim << "d" << arr->shortType() - << "(GET_IMAGE_WIDTH(a),GET_IMAGE_HEIGHT(a),GET_IMAGE_DEPTH(a),a,b,c)"; + std::string ocl_keyword = (device == Device::Type::OPENCL) ? "__global " : ""; + bufferDefines(defines, key, ndim, toString(arr->dtype()), arr->shortType(), ocl_keyword); } else { - char front_char = arr->shortType().front(); - std::string prefix = (front_char == 'u') ? "ui" : (front_char == 'f') ? "f" : "i"; - std::string access_type; if (param.first.find("dst") != std::string::npos || param.first.find("destination") != std::string::npos || param.first.find("output") != std::string::npos) @@ -164,22 +152,32 @@ oclDefines(const ParameterList & parameter_list, const ConstantList & constant_l { access_type = "__read_only"; } - std::string img_type_name = access_type + " image" + ndim + "d_t"; - - defines << "\n#define IMAGE_" << param.first << "_TYPE " << img_type_name; - defines << "\n#define READ_" << param.first << "_IMAGE(a,b,c) read_image" << prefix << "(a,b,c)"; - defines << "\n#define WRITE_" << param.first << "_IMAGE(a,b,c) write_image" << prefix << "(a,b,c)"; + imageDefines(defines, key, ndim, arr->shortType(), access_type); } + + // manage array size defines << "\n"; - defines << "\n#define IMAGE_SIZE_" << param.first << "_WIDTH " << std::to_string(arr->width()); - defines << "\n#define IMAGE_SIZE_" << param.first << "_HEIGHT " << std::to_string(arr->height()); - defines << "\n#define IMAGE_SIZE_" << param.first << "_DEPTH " << std::to_string(arr->depth()); - defines << "\n"; + defines << "\n#define IMAGE_SIZE_" << key << "_WIDTH " << std::to_string(arr->width()); + defines << "\n#define IMAGE_SIZE_" << key << "_HEIGHT " << std::to_string(arr->height()); + defines << "\n#define IMAGE_SIZE_" << key << "_DEPTH " << std::to_string(arr->depth()); + defines << "\n\n"; } - defines << "\n"; return defines.str(); } + +// Top function for creating defines at runtime +auto +generateDefines(const ParameterList & parameter_list, const ConstantList & constant_list, const Device::Type & device) + -> std::string +{ + std::ostringstream defines; + defines << commonDefines(constant_list); + defines << arrayDefines(parameter_list, device); + return defines.str(); +} + + auto execute(const Device::Pointer & device, const KernelInfo & kernel_func, @@ -188,21 +186,13 @@ execute(const Device::Pointer & device, const ConstantList & constants) -> void { // prepare kernel source for compilation and execution - auto kernel_source = kernel_func.second; - const auto kernel_name = kernel_func.first; - const auto kernel_preamble = cle::BackendManager::getInstance().getBackend().getPreamble(); - std::string defines; - switch (device->getType()) + auto kernel_source = kernel_func.second; + const auto kernel_name = kernel_func.first; + const auto kernel_preamble = cle::BackendManager::getInstance().getBackend().getPreamble(); + const auto defines = generateDefines(parameters, constants, device->getType()); + if (device->getType() == Device::Type::CUDA) { - case Device::Type::CUDA: { - defines = cle::cudaDefines(parameters, constants); - cle::translateOpenclToCuda(kernel_source); - break; - } - case Device::Type::OPENCL: { - defines = cle::oclDefines(parameters, constants); - break; - } + cle::translateOpenclToCuda(kernel_source); } const std::string program_source = defines + kernel_preamble + kernel_source; @@ -211,18 +201,12 @@ execute(const Device::Pointer & device, std::vector args_size; args_ptr.reserve(parameters.size()); args_size.reserve(parameters.size()); - -#if USE_OPENCL - const constexpr size_t size_of_pointer = sizeof(cl_mem); -#else - const constexpr size_t size_of_pointer = sizeof(void *); -#endif for (const auto & param : parameters) { if (const auto & arr = std::get_if(¶m.second)) { args_ptr.push_back(device->getType() == Device::Type::CUDA ? (*arr)->get() : *(*arr)->get()); - args_size.push_back(size_of_pointer); + args_size.push_back(GPU_MEM_PTR_SIZE); } else if (const auto & f = std::get_if(¶m.second)) { @@ -240,18 +224,12 @@ execute(const Device::Pointer & device, } } - // execute kernel in backend - try - { - cle::BackendManager::getInstance().getBackend().executeKernel( - device, program_source, kernel_name, global_range, args_ptr, args_size); - } - catch (const std::exception & e) - { - throw std::runtime_error("Error: Failed to execute the kernel. \n\t > " + std::string(e.what())); - } + // execute kernel + cle::BackendManager::getInstance().getBackend().executeKernel( + device, program_source, kernel_name, global_range, args_ptr, args_size); } + auto native_execute(const Device::Pointer & device, const KernelInfo & kernel_func, @@ -261,7 +239,7 @@ native_execute(const Device::Pointer & device, { // TODO @StRigaud: Implement native execution for OpenCL and CUDA // allows execution of pure CUDA or OpenCL code without CLIJ syntax - throw std::runtime_error("Error: Native execution is not implemented yet."); + throw std::runtime_error("WIP: Native execution is not implemented yet."); } } // namespace cle diff --git a/clic/src/openclbackend.cpp b/clic/src/openclbackend.cpp index 5b54b2679..7a3018296 100644 --- a/clic/src/openclbackend.cpp +++ b/clic/src/openclbackend.cpp @@ -1,11 +1,69 @@ #include "backend.hpp" #include "cle_preamble_cl.h" -#include +#include namespace cle { +#if USE_OPENCL +[[nodiscard]] static auto +getErrorString(const cl_int & error) -> std::string +{ + static const std::unordered_map openCLErrorToStr = { + { CL_SUCCESS, "CL_SUCCESS" }, + { CL_DEVICE_NOT_FOUND, "CL_DEVICE_NOT_FOUND" }, + { CL_DEVICE_NOT_AVAILABLE, "CL_DEVICE_NOT_AVAILABLE" }, + { CL_COMPILER_NOT_AVAILABLE, "CL_COMPILER_NOT_AVAILABLE" }, + { CL_MEM_OBJECT_ALLOCATION_FAILURE, "CL_MEM_OBJECT_ALLOCATION_FAILURE" }, + { CL_OUT_OF_RESOURCES, "CL_OUT_OF_RESOURCES" }, + { CL_OUT_OF_HOST_MEMORY, "CL_OUT_OF_HOST_MEMORY" }, + { CL_PROFILING_INFO_NOT_AVAILABLE, "CL_PROFILING_INFO_NOT_AVAILABLE" }, + { CL_MEM_COPY_OVERLAP, "CL_MEM_COPY_OVERLAP" }, + { CL_IMAGE_FORMAT_MISMATCH, "CL_IMAGE_FORMAT_MISMATCH" }, + { CL_IMAGE_FORMAT_NOT_SUPPORTED, "CL_IMAGE_FORMAT_NOT_SUPPORTED" }, + { CL_BUILD_PROGRAM_FAILURE, "CL_BUILD_PROGRAM_FAILURE" }, + { CL_MAP_FAILURE, "CL_MAP_FAILURE" }, + { CL_INVALID_VALUE, "CL_INVALID_VALUE" }, + { CL_INVALID_DEVICE_TYPE, "CL_INVALID_DEVICE_TYPE" }, + { CL_INVALID_PLATFORM, "CL_INVALID_PLATFORM" }, + { CL_INVALID_DEVICE, "CL_INVALID_DEVICE" }, + { CL_INVALID_CONTEXT, "CL_INVALID_CONTEXT" }, + { CL_INVALID_QUEUE_PROPERTIES, "CL_INVALID_QUEUE_PROPERTIES" }, + { CL_INVALID_COMMAND_QUEUE, "CL_INVALID_COMMAND_QUEUE" }, + { CL_INVALID_HOST_PTR, "CL_INVALID_HOST_PTR" }, + { CL_INVALID_MEM_OBJECT, "CL_INVALID_MEM_OBJECT" }, + { CL_INVALID_IMAGE_FORMAT_DESCRIPTOR, "CL_INVALID_IMAGE_FORMAT_DESCRIPTOR" }, + { CL_INVALID_IMAGE_SIZE, "CL_INVALID_IMAGE_SIZE" }, + { CL_INVALID_SAMPLER, "CL_INVALID_SAMPLER" }, + { CL_INVALID_BINARY, "CL_INVALID_BINARY" }, + { CL_INVALID_BUILD_OPTIONS, "CL_INVALID_BUILD_OPTIONS" }, + { CL_INVALID_PROGRAM, "CL_INVALID_PROGRAM" }, + { CL_INVALID_PROGRAM_EXECUTABLE, "CL_INVALID_PROGRAM_EXECUTABLE" }, + { CL_INVALID_KERNEL_NAME, "CL_INVALID_KERNEL_NAME" }, + { CL_INVALID_KERNEL_DEFINITION, "CL_INVALID_KERNEL_DEFINITION" }, + { CL_INVALID_KERNEL, "CL_INVALID_KERNEL" }, + { CL_INVALID_ARG_INDEX, "CL_INVALID_ARG_INDEX" }, + { CL_INVALID_ARG_VALUE, "CL_INVALID_ARG_VALUE" }, + { CL_INVALID_ARG_SIZE, "CL_INVALID_ARG_SIZE" }, + { CL_INVALID_KERNEL_ARGS, "CL_INVALID_KERNEL_ARGS" }, + { CL_INVALID_WORK_DIMENSION, "CL_INVALID_WORK_DIMENSION" }, + { CL_INVALID_WORK_GROUP_SIZE, "CL_INVALID_WORK_GROUP_SIZE" }, + { CL_INVALID_WORK_ITEM_SIZE, "CL_INVALID_WORK_ITEM_SIZE" }, + { CL_INVALID_GLOBAL_OFFSET, "CL_INVALID_GLOBAL_OFFSET" }, + { CL_INVALID_EVENT_WAIT_LIST, "CL_INVALID_EVENT_WAIT_LIST" }, + { CL_INVALID_EVENT, "CL_INVALID_EVENT" }, + { CL_INVALID_OPERATION, "CL_INVALID_OPERATION" }, + { CL_INVALID_GL_OBJECT, "CL_INVALID_GL_OBJECT" }, + { CL_INVALID_BUFFER_SIZE, "CL_INVALID_BUFFER_SIZE" }, + { CL_INVALID_MIP_LEVEL, "CL_INVALID_MIP_LEVEL" }, + { CL_INVALID_GLOBAL_WORK_SIZE, "CL_INVALID_GLOBAL_WORK_SIZE" }, + }; + auto ite = openCLErrorToStr.find(error); + return (ite != openCLErrorToStr.end()) ? ite->second : "CL_UNKNOWN_ERROR"; +} +#endif + auto OpenCLBackend::getDevices(const std::string & type) const -> std::vector { @@ -16,7 +74,7 @@ OpenCLBackend::getDevices(const std::string & type) const -> std::vector platformIds(platformCount); clGetPlatformIDs(platformCount, platformIds.data(), nullptr); @@ -58,7 +116,7 @@ OpenCLBackend::getDevices(const std::string & type) const -> std::vectorgetCLContext(), CL_MEM_READ_WRITE, size, nullptr, &err); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to allocate memory (buffer) with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to allocate buffer memory.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } *data_ptr = static_cast(new cl_mem(mem)); #else @@ -216,7 +275,8 @@ OpenCLBackend::allocateImage(const Device::Pointer & device, clCreateImage(opencl_device->getCLContext(), CL_MEM_READ_WRITE, &image_format, &image_description, nullptr, &err); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to allocate memory (image) with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to allocate image memory.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } *data_ptr = static_cast(new cl_mem(image)); #else @@ -232,7 +292,8 @@ OpenCLBackend::freeMemory(const Device::Pointer & device, const mType & mtype, v auto err = clReleaseMemObject(*cl_mem_ptr); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to free memory with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to free memory.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -288,7 +349,8 @@ OpenCLBackend::writeBuffer(const Device::Pointer & device, } if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to write memory (buffer) with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to write buffer memory.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -322,7 +384,8 @@ OpenCLBackend::writeImage(const Device::Pointer & device, nullptr); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to write memory (image) with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to write image memory.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -405,7 +468,8 @@ OpenCLBackend::readBuffer(const Device::Pointer & device, } if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to read memory (buffer) with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to read buffer memory.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -438,7 +502,8 @@ OpenCLBackend::readImage(const Device::Pointer & device, nullptr); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to read memory (image) with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to read image memory.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -476,29 +541,43 @@ OpenCLBackend::readMemory(const Device::Pointer & device, } auto -OpenCLBackend::copyMemoryBufferToBuffer(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void +OpenCLBackend::copyMemoryBufferToBuffer(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void { #if USE_OPENCL - auto opencl_device = std::dynamic_pointer_cast(device); - cl_int err; - const std::array region_ocl = { region[0] * bytes, region[1], region[2] }; - if (region[2] > 1 || region[1] > 1) + auto opencl_device = std::dynamic_pointer_cast(device); + cl_int err; + + region[0] *= bytes; + src_origin[0] *= bytes; + src_shape[0] *= bytes; + dst_origin[0] *= bytes; + dst_shape[0] *= bytes; + + size_t src_row_pitch = src_shape[1] > 1 ? src_shape[0] : 0; + size_t src_slice_pitch = src_shape[2] > 1 ? src_shape[0] * src_shape[1] : 0; + size_t dst_row_pitch = dst_shape[1] > 1 ? dst_shape[0] : 0; + size_t dst_slice_pitch = dst_shape[2] > 1 ? dst_shape[0] * dst_shape[1] : 0; + + if (src_shape[2] > 1 || src_shape[1] > 1) { err = clEnqueueCopyBufferRect(opencl_device->getCLCommandQueue(), - *static_cast(*src_data_ptr), - *static_cast(*dst_data_ptr), - origin.data(), - origin.data(), - region_ocl.data(), - 0, - 0, - 0, - 0, + *static_cast(*src_ptr), + *static_cast(*dst_ptr), + src_origin.data(), + dst_origin.data(), + region.data(), + src_row_pitch, + src_slice_pitch, + dst_row_pitch, + dst_slice_pitch, 0, nullptr, nullptr); @@ -506,19 +585,19 @@ OpenCLBackend::copyMemoryBufferToBuffer(const Device::Pointer & device, else { err = clEnqueueCopyBuffer(opencl_device->getCLCommandQueue(), - *static_cast(*src_data_ptr), - *static_cast(*dst_data_ptr), - origin[0], - origin[0], - region_ocl[0], + *static_cast(*src_ptr), + *static_cast(*dst_ptr), + src_origin[0], + dst_origin[0], + region[0], 0, nullptr, nullptr); } if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to copy memory (buffer->buffer) with error code " + - std::to_string(err)); + throw std::runtime_error("Error: Fail to copy memory from buffer to buffer.\nOpenCL error : " + + getErrorString(err) + " (" + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -526,28 +605,41 @@ OpenCLBackend::copyMemoryBufferToBuffer(const Device::Pointer & device, } auto -OpenCLBackend::copyMemoryBufferToImage(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void +OpenCLBackend::copyMemoryBufferToImage(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void { #if USE_OPENCL auto opencl_device = std::dynamic_pointer_cast(device); + + region[0] *= bytes; + src_origin[0] *= bytes; + src_shape[0] *= bytes; + dst_origin[0] *= bytes; + + size_t src_row_pitch = src_shape[1] > 1 ? src_shape[0] : 0; + size_t src_slice_pitch = src_shape[2] > 1 ? src_shape[0] * src_shape[1] : 0; + size_t bufferOffset = src_origin[0] + src_origin[1] * src_row_pitch + src_origin[2] * src_slice_pitch; + auto err = clEnqueueCopyBufferToImage(opencl_device->getCLCommandQueue(), - *static_cast(*src_data_ptr), - *static_cast(*dst_data_ptr), - 0, - origin.data(), + *static_cast(*src_ptr), + *static_cast(*dst_ptr), + bufferOffset, + dst_origin.data(), region.data(), 0, nullptr, nullptr); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to copy memory (buffer->image) with error code " + - std::to_string(err)); + throw std::runtime_error("Error: Fail to copy memory from buffer to image.\nOpenCL error : " + getErrorString(err) + + " (" + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -555,29 +647,42 @@ OpenCLBackend::copyMemoryBufferToImage(const Device::Pointer & device, } auto -OpenCLBackend::copyMemoryImageToBuffer(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void +OpenCLBackend::copyMemoryImageToBuffer(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void { #if USE_OPENCL auto opencl_device = std::dynamic_pointer_cast(device); + + region[0] *= bytes; + src_origin[0] *= bytes; + dst_shape[0] *= bytes; + dst_origin[0] *= bytes; + + size_t dst_row_pitch = dst_shape[1] > 1 ? dst_shape[0] : 0; + size_t dst_slice_pitch = dst_shape[2] > 1 ? dst_shape[0] * dst_shape[1] : 0; + size_t bufferOffset = src_origin[0] + src_origin[1] * dst_row_pitch + src_origin[2] * dst_slice_pitch; + auto err = clEnqueueCopyImageToBuffer(opencl_device->getCLCommandQueue(), - *static_cast(*src_data_ptr), - *static_cast(*dst_data_ptr), - origin.data(), + *static_cast(*src_ptr), + *static_cast(*dst_ptr), + src_origin.data(), region.data(), - 0, + bufferOffset, 0, nullptr, nullptr); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to copy memory (image->buffer) with error code " + - std::to_string(err)); + throw std::runtime_error("Error: Fail to copy memory from image to buffer.\nOpenCL error : " + getErrorString(err) + + " (" + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -585,28 +690,38 @@ OpenCLBackend::copyMemoryImageToBuffer(const Device::Pointer & device, } auto -OpenCLBackend::copyMemoryImageToImage(const Device::Pointer & device, - const void ** src_data_ptr, - const std::array & region, - const std::array & origin, - const size_t & bytes, - void ** dst_data_ptr) const -> void +OpenCLBackend::copyMemoryImageToImage(const Device::Pointer & device, + const void ** src_ptr, + std::array & src_origin, + std::array & src_shape, + void ** dst_ptr, + std::array & dst_origin, + std::array & dst_shape, + std::array & region, + const size_t & bytes) const -> void { #if USE_OPENCL auto opencl_device = std::dynamic_pointer_cast(device); + + region[0] *= bytes; + src_origin[0] *= bytes; + src_shape[0] *= bytes; + dst_origin[0] *= bytes; + dst_shape[0] *= bytes; + auto err = clEnqueueCopyImage(opencl_device->getCLCommandQueue(), - *static_cast(*src_data_ptr), - *static_cast(*dst_data_ptr), - origin.data(), - origin.data(), + *static_cast(*src_ptr), + *static_cast(*dst_ptr), + src_origin.data(), + dst_origin.data(), region.data(), 0, nullptr, nullptr); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to copy memory (image->image) with error code " + - std::to_string(err)); + throw std::runtime_error("Error: Fail to copy memory from image to image.\nOpenCL error : " + getErrorString(err) + + " (" + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -779,7 +894,8 @@ OpenCLBackend::setBuffer(const Device::Pointer & device, if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to set memory (buffer) with error code " + std::to_string(err) + "."); + throw std::runtime_error("Error: Fail to fill buffer memory.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -845,7 +961,8 @@ OpenCLBackend::setImage(const Device::Pointer & device, } if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to set memory (image) with error code " + std::to_string(err) + "."); + throw std::runtime_error("Error: Fail to fill image memory.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } #else throw std::runtime_error("Error: OpenCL is not enabled"); @@ -903,8 +1020,8 @@ OpenCLBackend::buildKernel(const Device::Pointer & device, prog = clCreateProgramWithSource(opencl_device->getCLContext(), 1, &source, nullptr, &err); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to create program from source with error code " + - std::to_string(err)); + throw std::runtime_error("Error: Fail to create program from source.\nOpenCL error : " + getErrorString(err) + + " (" + std::to_string(err) + ")."); } cl_int buildStatus = clBuildProgram(prog, 1, &opencl_device->getCLDevice(), "-w", nullptr, nullptr); if (buildStatus != CL_SUCCESS) @@ -915,15 +1032,16 @@ OpenCLBackend::buildKernel(const Device::Pointer & device, buffer.resize(len); clGetProgramBuildInfo(prog, opencl_device->getCLDevice(), CL_PROGRAM_BUILD_LOG, len, &buffer[0], &len); std::cerr << "Build log: " << buffer << std::endl; - throw std::runtime_error("Error (ocl): Failed to build program " + kernel_name + " with error code " + - std::to_string(err)); + throw std::runtime_error("Error: Fail to build program " + kernel_name + + ".\nOpenCL error : " + getErrorString(err) + " (" + std::to_string(err) + ")."); } saveProgramToCache(device, hash, &prog); } auto ocl_kernel = clCreateKernel(prog, kernel_name.c_str(), &err); if (err != CL_SUCCESS) { - throw std::runtime_error("Error: Failed to create kernel with error code " + std::to_string(err)); + throw std::runtime_error("Error: Fail to create kernel.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } *reinterpret_cast(kernel) = ocl_kernel; #else @@ -940,29 +1058,26 @@ OpenCLBackend::executeKernel(const Device::Pointer & device, const std::vector & sizes) const -> void { #if USE_OPENCL - auto opencl_device = std::dynamic_pointer_cast(device); + auto opencl_device = std::dynamic_pointer_cast(device); + cl_kernel ocl_kernel; - try - { - buildKernel(device, kernel_source, kernel_name, &ocl_kernel); - } - catch (const std::exception & e) - { - throw std::runtime_error("Error (ocl): Failed to build kernel. \n\t > " + std::string(e.what())); - } + buildKernel(device, kernel_source, kernel_name, &ocl_kernel); + for (size_t i = 0; i < args.size(); i++) { auto err = clSetKernelArg(ocl_kernel, i, sizes[i], args[i]); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to set kernel arguments (" + std::to_string(err) + ").)"); + throw std::runtime_error("Error: Fail to set kernel arguments.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } } auto err = clEnqueueNDRangeKernel( opencl_device->getCLCommandQueue(), ocl_kernel, 3, nullptr, global_size.data(), nullptr, 0, nullptr, nullptr); if (err != CL_SUCCESS) { - throw std::runtime_error("Error (ocl): Failed to launch kernel (" + std::to_string(err) + ").)"); + throw std::runtime_error("Error: Fail to launch kernel.\nOpenCL error : " + getErrorString(err) + " (" + + std::to_string(err) + ")."); } opencl_device->finish(); #else diff --git a/clic/src/opencldevice.cpp b/clic/src/opencldevice.cpp index a8095d699..fba3cb978 100644 --- a/clic/src/opencldevice.cpp +++ b/clic/src/opencldevice.cpp @@ -162,7 +162,7 @@ OpenCLDevice::getInfo() const -> std::string } auto -OpenCLDevice::getCache() -> std::map & +OpenCLDevice::getCache() -> OpenCLDevice::CacheType & { return this->cache; } diff --git a/tests/array_test.cpp b/tests/array_test.cpp index d8adb4149..bc69d82ef 100644 --- a/tests/array_test.cpp +++ b/tests/array_test.cpp @@ -85,7 +85,6 @@ run_test(const std::array & shape, const cle::mType & mem_type) -> bo // } // std::cout << std::endl; - std::array region = { 6, 3, 1 }; std::vector subtest(region[0] * region[1] * region[2]); gpu_input->read(subtest.data(), region, { 1, 1, 0 }); @@ -99,8 +98,13 @@ run_test(const std::array & shape, const cle::mType & mem_type) -> bo // std::cout << std::endl; gpu_input->read(&value, 6, 6, 0); - gpu_input->fill(0); - gpu_input->read(input.data()); + // std::cout << value << std::endl << std::endl; + + auto gpu_copy = cle::Array::create(gpu_input); + gpu_copy->fill(-5); + + gpu_input->copy(gpu_copy, region, { 1, 1, 0 }, { 3, 2, 0 }); + gpu_copy->read(input.data()); // for (int i = 0; i < input.size(); i++) // { @@ -110,7 +114,6 @@ run_test(const std::array & shape, const cle::mType & mem_type) -> bo // } // std::cout << std::endl; // std::cout << std::endl; - // std::cout << value << std::endl; std::cout << "all good\n"; }