diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index c6fed94..baed5ad 100644 --- a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -1,124 +1,167 @@ -# Setup directories -set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT}/third_party") -set(DAWN_DIR "${FETCHCONTENT_BASE_DIR}/dawn" CACHE INTERNAL "") -set(DAWN_BUILD_DIR "${DAWN_DIR}/build" CACHE INTERNAL "") +cmake_minimum_required(VERSION 3.14) +include(ExternalProject) +include(FetchContent) + +# include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/print_target.cmake") + + +# Setup directories and basic paths +set(FETCHCONTENT_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") +set(DAWN_DIR "${FETCHCONTENT_BASE_DIR}/dawn" CACHE INTERNAL "Dawn source directory") + +# For Emscripten builds (if desired) +set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") +set(EMSCRIPTEN_DIR "${EM_SDK_DIR}/upstream/emscripten" CACHE INTERNAL "") + +# Decide where to build Dawn’s build files. if(EMSCRIPTEN) - set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") - set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") - set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "web build directory" FORCE) +elseif(WIN32) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_win" CACHE INTERNAL "windows build directory" FORCE) +elseif(IOS) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_ios" CACHE INTERNAL "ios build directory" FORCE) +elseif(APPLE) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_mac" CACHE INTERNAL "mac build directory" FORCE) +elseif(ANDROID) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_android" CACHE INTERNAL "android build directory" FORCE) else() - add_compile_definitions(USE_DAWN_API) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_unix" CACHE INTERNAL "linux build directory" FORCE) endif() -# Enable find for no dawn rebuilds with flutter run -set(ENABLE_DAWN_FIND OFF CACHE BOOL "Enable finding Dawn" FORCE) +# Add Dawn header include directories so that they are available later. +include_directories(BEFORE PUBLIC + "${DAWN_BUILD_DIR}/src/dawn/native/" + "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + "${DAWN_BUILD_DIR}/src/dawn/native/Release" +) + + +# Optionally try to find an existing Dawn build. +set(ENABLE_DAWN_FIND OFF CACHE BOOL "Attempt to find an existing Dawn build" FORCE) set(DAWN_BUILD_FOUND OFF CACHE BOOL "Dawn build found" FORCE) + if(ENABLE_DAWN_FIND) - # find_library, windows adds extra folder - if(MSVC) - find_library(WEBGPU_DAWN_DEBUG webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" - ) - find_library(WEBGPU_DAWN_RELEASE webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" - ) - set(DAWN_BUILD_FOUND ON) - elseif(NOT EMSCRIPTEN AND NOT MSVC) - find_library(WEBGPU_DAWN_LIB - NAMES webgpu_dawn - PATHS "${DAWN_BUILD_DIR}/src/dawn/native" - REQUIRED - ) - set(DAWN_BUILD_FOUND ON) - else() - set(DAWN_BUILD_FOUND ON) + message(STATUS "Attempting to find an existing Dawn build...") + if(WIN32) + find_library(WEBGPU_DAWN_DEBUG NAMES webgpu_dawn HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug") + find_library(WEBGPU_DAWN_RELEASE NAMES webgpu_dawn HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release") + + if(WEBGPU_DAWN_DEBUG OR WEBGPU_DAWN_RELEASE) + message(STATUS "Dawn build found on Windows. 
Debug: ${WEBGPU_DAWN_DEBUG}, Release: ${WEBGPU_DAWN_RELEASE}") + set(DAWN_BUILD_FOUND ON) + endif() + elseif(NOT EMSCRIPTEN AND NOT WIN32) + find_library(WEBGPU_DAWN_LIB NAMES webgpu_dawn.so PATHS "${DAWN_BUILD_DIR}/src/dawn/native") + + if(WEBGPU_DAWN_LIB) + message(STATUS "Dawn build found on Linux/Unix. Library: ${WEBGPU_DAWN_LIB}") + set(DAWN_BUILD_FOUND ON) endif() + endif() endif() -# Dawn options for more, -# see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt -set(DAWN_ALWAYS_ASSERT OFF CACHE INTERNAL "Always assert in Dawn" FORCE) -set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) -set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) -set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) -set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) -set(DAWN_ENABLE_INSTALL OFF CACHE INTERNAL "Enable Dawn installation" FORCE) -set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) -set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) -set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) -set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) +# Pre-build Dawn at configuration time if not already built. if(NOT DAWN_BUILD_FOUND) - include(FetchContent) - message("webgpu_dawn not found start building") - if(EMSCRIPTEN) - set(EMSCRIPTEN_DIR "${EM_SDK_DIR}/upstream/emscripten" CACHE INTERNAL "" FORCE) - set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EMSCRIPTEN_DIR} CACHE INTERNAL "" FORCE) - endif() + message(STATUS "Dawn build not found - pre-building Dawn.") - FetchContent_Declare( - dawn - DOWNLOAD_DIR ${DAWN_DIR} - SOURCE_DIR ${DAWN_DIR} - SUBBUILD_DIR ${DAWN_BUILD_DIR}/tmp - BINARY_DIR ${DAWN_BUILD_DIR} - DOWNLOAD_COMMAND - cd ${DAWN_DIR} && - git init && - git fetch --depth=1 https://dawn.googlesource.com/dawn && - git reset --hard FETCH_HEAD - ) + # Force Dawn build options. + set(DAWN_ALWAYS_ASSERT ON CACHE INTERNAL "Always assert in Dawn" FORCE) + set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) + set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) + set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) + set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) + set(DAWN_ENABLE_INSTALL OFF CACHE INTERNAL "Enable Dawn installation" FORCE) + set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) + set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) + set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) + set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) + set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EMSCRIPTEN_DIR} CACHE INTERNAL "Emscripten toolchain" FORCE) - # Download the repository and add it as a subdirectory. - FetchContent_MakeAvailable(dawn) + set(DAWN_COMMIT "66d57f910357befb441b91162f29a97f687af6d9" CACHE STRING "Dawn commit to checkout" FORCE) + + file(MAKE_DIRECTORY ${DAWN_DIR}) + # Initialize Git and set/update remote. + execute_process(COMMAND git init + WORKING_DIRECTORY "${DAWN_DIR}" + ) + execute_process( + COMMAND git remote add origin https://dawn.googlesource.com/dawn + WORKING_DIRECTORY "${DAWN_DIR}" + ) + # Fetch and checkout the specified commit. 
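  # Note (assumption, not re-verified on a clean checkout): these git steps are meant to
  # be safe to re-run on every configure -- `git init` and `git remote add` may only warn
  # once the repository already exists, and the fetch/checkout below simply re-pin the
  # tree to DAWN_COMMIT so that FetchContent_MakeAvailable builds a reproducible revision.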
+ execute_process( + COMMAND git fetch origin ${DAWN_COMMIT} + WORKING_DIRECTORY "${DAWN_DIR}" + ) + execute_process( + COMMAND git checkout ${DAWN_COMMIT} + WORKING_DIRECTORY "${DAWN_DIR}" + ) + execute_process( + COMMAND git reset --hard ${DAWN_COMMIT} + WORKING_DIRECTORY "${DAWN_DIR}" + ) + # Fetch the Dawn repository if not already present. + FetchContent_Declare( + dawn + SOURCE_DIR ${DAWN_DIR} + SUBBUILD_DIR ${DAWN_BUILD_DIR}/tmp + BINARY_DIR ${DAWN_BUILD_DIR} + ) + FetchContent_MakeAvailable(dawn) - # attempt fix flutter rebuilds - set(CMAKE_INCLUDE_PATH "${CMAKE_INCLUDE_PATH};${DAWN_DIR}/src" CACHE INTERNAL "") + set(CMAKE_INCLUDE_PATH "${CMAKE_INCLUDE_PATH};${DAWN_DIR}/src" CACHE INTERNAL "") - execute_process( - WORKING_DIRECTORY ${DAWN_DIR} - COMMAND ${CMAKE_COMMAND} -S ${DAWN_DIR} - -B ${DAWN_BUILD_DIR} - ) + set(DAWN_BUILD_FOUND ON) +endif() # End pre-build Dawn - # Build Dawn - execute_process( - COMMAND ${CMAKE_COMMAND} --build ${DAWN_BUILD_DIR} - ) - - # find_library, windows adds extra folder - if(MSVC) - find_library(WEBGPU_DAWN_DEBUG webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" - ) - find_library(WEBGPU_DAWN_RELEASE webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" - ) - set(DAWN_BUILD_FOUND ON) - elseif(NOT EMSCRIPTEN AND NOT MSVC) - find_library(WEBGPU_DAWN_LIB - NAMES webgpu_dawn - PATHS "${DAWN_BUILD_DIR}/src/dawn/native" - REQUIRED - ) - set(DAWN_BUILD_FOUND ON) - else() - set(DAWN_BUILD_FOUND ON) - endif() +# Create an IMPORTED target for the Dawn library. +# Adjust the expected output name/extension per platform. +if(MSVC) +message(STATUS "Dawn build found on Windows.") +# MSVC: use separate debug and release dlls. +if((NOT WEBGPU_DAWN_DEBUG) OR (WEBGPU_DAWN_DEBUG MATCHES "NOTFOUND")) + find_library(WEBGPU_DAWN_DEBUG NAMES webgpu_dawn PATHS "${DAWN_BUILD_DIR}/src/dawn/native/Debug") +endif() +if((NOT WEBGPU_DAWN_RELEASE) OR (WEBGPU_DAWN_RELEASE MATCHES "NOTFOUND")) + find_library(WEBGPU_DAWN_RELEASE NAMES webgpu_dawn PATHS "${DAWN_BUILD_DIR}/src/dawn/native/Release") endif() -if(EMSCRIPTEN) - add_library(webgpu_dawn INTERFACE IMPORTED) - target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include) - target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/webgpu.h) - target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js) - target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js) - target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js) - target_link_libraries(webgpu_dawn INTERFACE ${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js) -else() +if(WEBGPU_DAWN_DEBUG OR WEBGPU_DAWN_RELEASE) + if(NOT TARGET webgpu_dawn) + add_library(webgpu_dawn INTERFACE) + target_link_libraries(webgpu_dawn INTERFACE + $<$:${WEBGPU_DAWN_DEBUG}> + $<$:${WEBGPU_DAWN_RELEASE}> + ) + endif() endif() +elseif(IOS) + # On iOS, it is common to build a static library. + if(NOT TARGET webgpu_dawn) + add_library(webgpu_dawn STATIC IMPORTED) + set_target_properties(webgpu_dawn PROPERTIES + IMPORTED_LOCATION "${DAWN_BUILD_DIR}/src/dawn/native/webgpu_dawn.a") + endif() +elseif(APPLE) + # On macOS (non-iOS), typically a dynamic library (.dylib) is built. 
+ if(NOT TARGET webgpu_dawn) + add_library(webgpu_dawn SHARED IMPORTED) + set_target_properties(webgpu_dawn PROPERTIES + IMPORTED_LOCATION "${DAWN_BUILD_DIR}/src/dawn/native/webgpu_dawn.dylib") + endif() +elseif(ANDROID) + if(NOT TARGET webgpu_dawn) + add_library(webgpu_dawn SHARED IMPORTED) + set_target_properties(webgpu_dawn PROPERTIES + IMPORTED_LOCATION "${DAWN_BUILD_DIR}/src/dawn/native/webgpu_dawn.so") + endif() +elseif(NOT EMSCRIPTEN) # For Linux and other Unix-like systems. + if(NOT TARGET webgpu_dawn) + add_library(webgpu_dawn SHARED IMPORTED) + set_target_properties(webgpu_dawn PROPERTIES + IMPORTED_LOCATION "${DAWN_BUILD_DIR}/src/dawn/native/webgpu_dawn.so") + endif() +endif() \ No newline at end of file diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index d991a18..d57f083 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -15,7 +15,6 @@ message(STATUS "PROJECT_ROOT: ${PROJECT_ROOT}") set(GPU_SOURCES "${PROJECT_ROOT}/gpu.cpp" "${PROJECT_ROOT}/numeric_types/half.cpp" - "${DAWN_BUILD_DIR}/gen/include/dawn/webgpu.h" ) # Add headers diff --git a/gpu.hpp b/gpu.hpp index 69ed0e9..4b2afee 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -195,7 +195,16 @@ struct TensorPool { enum NumType { kf16, // (experimental) kf32, - ki32 + kf64, + ki8, + ki16, + ki32, + ki64, + ku8, + ku16, + ku32, + ku64, + kUnknown }; /** @@ -204,11 +213,27 @@ enum NumType { inline size_t sizeBytes(const NumType &type) { switch (type) { case kf16: - return sizeof(uint16_t); + return sizeof(half); case kf32: return sizeof(float); + case kf64: + return sizeof(double); + case ki8: + return sizeof(int8_t); + case ki16: + return sizeof(int16_t); case ki32: return sizeof(int32_t); + case ki64: + return sizeof(int64_t); + case ku8: + return sizeof(uint8_t); + case ku16: + return sizeof(uint16_t); + case ku32: + return sizeof(uint32_t); + case ku64: + return sizeof(uint64_t); default: LOG(kDefLog, kError, "Invalid NumType in size calculation."); return 0; @@ -224,8 +249,24 @@ inline std::string toString(NumType type) { return "f16"; case kf32: return "f32"; + case kf64: + return "f64"; + case ki8: + return "i8"; + case ki16: + return "i16"; case ki32: return "i32"; + case ki64: + return "i64"; + case ku8: + return "u8"; + case ku16: + return "u16"; + case ku32: + return "u32"; + case ku64: + return "u64"; default: LOG(kDefLog, kError, "Invalid NumType in string conversion."); return "unknown"; @@ -693,6 +734,18 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype) { * Tensor tensor = createTensor(ctx, {256, 256}, kf32, data); * @endcode */ +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const half *data) { + assert(dtype == kf16); + Tensor tensor = + createTensor(ctx.pool, ctx.device, shape, dtype, + WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc); + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, + tensor.data.size); + return tensor; +} + inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, const float *data) { assert(dtype == kf32); @@ -705,6 +758,32 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, return tensor; } +// Overload for double: pack each double into a float (losing precision) +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const double *data) { + assert(dtype == kf64); + size_t numElements = size(shape); + // Each double (8 bytes) will be packed into 2 uint32_t values (2×4 bytes). 
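  // Clarification: despite the "losing precision" note above, this is a raw bit copy,
  // not a float conversion. The low 32 bits of each double go to packed[2 * i] and the
  // high 32 bits to packed[2 * i + 1], so the value survives the round trip exactly;
  // the matching toGPU/toCPU overloads below reassemble the same word order.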
+ std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t bits; + std::memcpy(&bits, &data[i], sizeof(double)); // Extract raw bits. + packed[2 * i] = static_cast(bits & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(bits >> 32); + } + // Create a tensor using the core overload that accepts a TensorPool and + // WGPUDevice. + Tensor tensor = + createTensor(ctx.pool, ctx.device, shape, kf64, + WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc); + + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + packed.size() * sizeof(uint32_t)); + + return tensor; +} + inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, const int32_t *data) { assert(dtype == ki32); @@ -717,27 +796,55 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, return tensor; } -/** - * @brief Overload of the tensor factory function to instantiate a tensor on - * the GPU with a given shape, data type. This overload also takes initial - * half* data to populate the tensor with. - * - * The data is assumed to be of size equal to the product of the dimensions in - * the shape, and is copied to the GPU buffer. - * - * @param[in] ctx Context instance to manage the tensor - * @param[in] shape Shape of the tensor - * @param[in] dtype Data type of the tensor (e.g. kf32) - * @param[in] data Initial data to populate the tensor with - * @return Tensor instance representing the created tensor - * - * @code - * Tensor tensor = createTensor(ctx, {256, 256}, kf32, data); - * @endcode - */ +// Overload for int8_t: pack four 8‑bit ints into one 32‑bit integer inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, - const half *data) { - assert(dtype == kf16); + const int8_t *data) { + assert(dtype == ki8); // unsupported: pack into ki32 + size_t numElements = size(shape); + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + // pack as unsigned bits then reinterpret; shader is then responsible for + // unpacking + packed[idx] |= (static_cast(data[i]) << shift); + } + return createTensor(ctx, shape, ki32, packed.data()); +} + +// Overload for int16_t: pack two 16‑bit ints into one 32‑bit integer +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const int16_t *data) { + assert(dtype == ki16); // unsupported: pack into ki32 + size_t numElements = size(shape); + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + return createTensor(ctx, shape, ki32, packed.data()); +} + +// Overload for int64_t: pack each 64‑bit int into two 32‑bit integers +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const int64_t *data) { + assert(dtype == ki64); // unsupported: pack into two ki32s + size_t numElements = size(shape); + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + int64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast((val >> 32) & 0xFFFFFFFF); + } + return createTensor(ctx, shape, ki32, packed.data()); +} + +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const uint32_t *data) { + assert(dtype == ku32); Tensor tensor = 
createTensor(ctx.pool, ctx.device, shape, dtype, WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | @@ -747,6 +854,53 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, return tensor; } +// Overload for uint8_t: pack four 8‑bit integers into one 32‑bit unsigned +// integer +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const uint8_t *data) { + assert(dtype == ku8); // unsupported: pack into ku32 + size_t numElements = size(shape); + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + packed[idx] |= (static_cast(data[i]) << shift); + } + return createTensor(ctx, shape, ku32, packed.data()); +} + +// Overload for uint16_t: pack two 16‑bit integers into one 32‑bit unsigned +// integer +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const uint16_t *data) { + assert(dtype == ku16); // unsupported: pack into ku32 + size_t numElements = size(shape); + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + return createTensor(ctx, shape, ku32, packed.data()); +} + +// Overload for uint64_t: pack each 64‑bit integer into two 32‑bit unsigned +// integers +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const uint64_t *data) { + assert(dtype == ku64); // unsupported: pack into two ku32s + size_t numElements = size(shape); + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(val >> 32); + } + return createTensor(ctx, shape, ku32, packed.data()); +} + /** * @brief Frees a tensor resource and updates the tensor pool. * @@ -869,7 +1023,8 @@ template T wait(Context &ctx, std::future &f) { * Context ctx = waitForContextFuture(contextFuture); * @endcode */ -template T waitForContextFuture(std::future &f, size_t sleepTime = 10) { +template +T waitForContextFuture(std::future &f, size_t sleepTime = 10) { #ifdef __EMSCRIPTEN__ while (f.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { @@ -1358,8 +1513,9 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, WGPUBufferMapCallbackInfo mapCallbackInfo = { .mode = WGPUCallbackMode_AllowSpontaneous, .callback = bufferMapCallback, - .userdata1 = const_cast(cbData), // Pass the callback data. - .userdata2 = nullptr // No additional user data. + .userdata1 = + const_cast(cbData), // Pass the callback data. + .userdata2 = nullptr // No additional user data. }; // Begin the asynchronous mapping of the readback buffer. @@ -1639,6 +1795,251 @@ inline void toCPU(Context &ctx, Tensor &tensor, std::array &data, wait(ctx, future); } +inline void toCPU(Context &ctx, Tensor &tensor, NumType dtype, void *output, + size_t sourceOffset = 0) { + size_t numElements = size(tensor.shape); + switch (dtype) { + // These types are directly supported. + case kf16: + case kf32: + case ku32: + case ki32: + toCPU(ctx, tensor, output, tensor.data.size, sourceOffset); + break; + + // kf64 to reverse bit‐packing of doubles. + case kf64: { + // We expect each double to have been packed into 2 uint32_t values. 
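    // The unpack below mirrors that packing: word 2 * i holds the low 32 bits and
    // word 2 * i + 1 the high bits, so reassembling them and memcpy-ing into a double
    // restores the original value bit-for-bit.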
+ std::vector tmp(numElements * 2); + // Read the packed data (each element is 4 bytes) + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + double *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + uint32_t low = tmp[2 * i]; + uint32_t high = tmp[2 * i + 1]; + // Reassemble the 64-bit raw representation. + uint64_t bits = (static_cast(high) << 32) | low; + // Copy the raw bits into a double. + double d; + std::memcpy(&d, &bits, sizeof(double)); + dst[i] = d; + } + break; + } + + // For int8_t: four 8‑bit ints packed into one int32_t. + case ki8: { + size_t packedCount = (numElements + 3) / 4; + std::vector tmp(packedCount); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset); + int8_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFF); + } + break; + } + + // For int16_t: two 16‑bit ints packed into one int32_t. + case ki16: { + size_t packedCount = (numElements + 1) / 2; + std::vector tmp(packedCount); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset); + int16_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFFFF); + } + break; + } + + // For int64_t: each 64‑bit int was packed into two int32_t. + case ki64: { + std::vector tmp(numElements * 2); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset); + int64_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + int32_t low = tmp[2 * i]; + int32_t high = tmp[2 * i + 1]; + dst[i] = + (static_cast(high) << 32) | (static_cast(low)); + } + break; + } + + // For uint8_t: four 8‑bit uints packed into one uint32_t. + case ku8: { + size_t packedCount = (numElements + 3) / 4; + std::vector tmp(packedCount); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + uint8_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFF); + } + break; + } + + // For uint16_t: two 16‑bit uints packed into one uint32_t. + case ku16: { + size_t packedCount = (numElements + 1) / 2; + std::vector tmp(packedCount); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + uint16_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFFFF); + } + break; + } + + // For uint64_t: each 64‑bit unsigned int was packed into two uint32_t. + case ku64: { + std::vector tmp(numElements * 2); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + uint64_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + uint32_t low = tmp[2 * i]; + uint32_t high = tmp[2 * i + 1]; + dst[i] = (static_cast(high) << 32) | low; + } + break; + } + + default: + LOG(kDefLog, kError, "Unsupported dtype in toCPUUnpack"); + break; + } +} + +inline void toCPU(Context &ctx, WGPUBuffer buffer, NumType dtype, void *output, + size_t numElements, size_t sourceOffset = 0) { + switch (dtype) { + // Directly supported types. 
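  // For these four types the GPU-side element width equals sizeBytes(dtype), so the
  // readback is a plain copy of numElements * sizeBytes(dtype) bytes; every other case
  // below reads back the packed 32-bit representation and converts it on the CPU.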
+ case kf16: + case kf32: + case ku32: + case ki32: { + size_t byteSize = numElements * sizeBytes(dtype); + toCPU(ctx, buffer, output, byteSize, sourceOffset); + break; + } + + // kf64 to reverse bit‐packing of doubles. + case kf64: { + // We expect each double to have been packed into 2 uint32_t values. + std::vector tmp(numElements * 2); + // Read the packed data (each element is 4 bytes) + toCPU(ctx, buffer, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + double *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + uint32_t low = tmp[2 * i]; + uint32_t high = tmp[2 * i + 1]; + // Reassemble the 64-bit raw representation. + uint64_t bits = (static_cast(high) << 32) | low; + // Copy the raw bits into a double. + double d; + std::memcpy(&d, &bits, sizeof(double)); + dst[i] = d; + } + break; + } + + // For int8_t: four 8‑bit ints packed into one int32_t. + case ki8: { + size_t packedCount = (numElements + 3) / 4; + std::vector tmp(packedCount); + toCPU(ctx, buffer, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset); + int8_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFF); + } + break; + } + + // For int16_t: two 16‑bit ints packed into one int32_t. + case ki16: { + size_t packedCount = (numElements + 1) / 2; + std::vector tmp(packedCount); + toCPU(ctx, buffer, tmp.data(), packedCount * sizeof(int32_t), sourceOffset); + int16_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFFFF); + } + break; + } + + // For int64_t: each 64‑bit int is packed into two int32_t. + case ki64: { + std::vector tmp(numElements * 2); + toCPU(ctx, buffer, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset); + int64_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + int32_t low = tmp[2 * i]; + int32_t high = tmp[2 * i + 1]; + dst[i] = + (static_cast(high) << 32) | (static_cast(low)); + } + break; + } + + // For uint8_t: four 8‑bit uints packed into one uint32_t. + case ku8: { + size_t packedCount = (numElements + 3) / 4; + std::vector tmp(packedCount); + toCPU(ctx, buffer, tmp.data(), packedCount * sizeof(uint32_t), + sourceOffset); + uint8_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFF); + } + break; + } + + // For uint16_t: two 16‑bit uints packed into one uint32_t. + case ku16: { + size_t packedCount = (numElements + 1) / 2; + std::vector tmp(packedCount); + toCPU(ctx, buffer, tmp.data(), packedCount * sizeof(uint32_t), + sourceOffset); + uint16_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFFFF); + } + break; + } + + // For uint64_t: each 64‑bit unsigned int packed into two uint32_t. 
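  // Reassembled as (uint64_t(high) << 32) | low, the inverse of the split performed by
  // the uint64_t createTensor/toGPU overloads.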
+ case ku64: { + std::vector tmp(numElements * 2); + toCPU(ctx, buffer, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + uint64_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + uint32_t low = tmp[2 * i]; + uint32_t high = tmp[2 * i + 1]; + dst[i] = (static_cast(high) << 32) | low; + } + break; + } + + default: + LOG(kDefLog, kError, "Unsupported dtype in toCPU (raw buffer override)"); + break; + } +} + /** * @brief Copies data from CPU memory to a GPU buffer. The toGPU overloads are * effectively a convenience wrapper around the WebGPU API call @@ -1659,6 +2060,112 @@ inline void toGPU(Context &ctx, const void *data, WGPUBuffer buffer, wgpuQueueWriteBuffer(ctx.queue, buffer, 0, data, size); } +// Overload for float: directly copy the float data. +inline void toGPU(Context &ctx, const float *data, WGPUBuffer buffer, + size_t size) { + toGPU(ctx, static_cast(data), buffer, size); +} + +// Overload for half: directly copy the half data. +inline void toGPU(Context &ctx, const half *data, WGPUBuffer buffer, + size_t size) { + toGPU(ctx, static_cast(data), buffer, size); +} + +// Overload for double: bit-pack each double into two 32‑bit unsigned integers. +inline void toGPU(Context &ctx, const double *data, WGPUBuffer buffer, + size_t numElements) { + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t bits; + std::memcpy(&bits, &data[i], + sizeof(double)); // Reinterpret double as raw bits. + packed[2 * i] = static_cast(bits & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(bits >> 32); + } + toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(uint32_t)); +} + +// Overload for int8_t: pack four 8‑bit ints into one 32‑bit integer. +inline void toGPU(Context &ctx, const int8_t *data, WGPUBuffer buffer, + size_t numElements) { + // Number of int8_t elements equals size (sizeof(int8_t)==1) + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + packed[idx] |= (static_cast(data[i]) << shift); + //LOG(kDefLog, kInfo, "toGPU: %d %d %d", data[i], packed[idx], idx); + } + toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t)); +} + +// Overload for int16_t: pack two 16‑bit ints into one 32‑bit integer. +inline void toGPU(Context &ctx, const int16_t *data, WGPUBuffer buffer, + size_t numElements) { + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t)); +} + +// Overload for int64_t: pack each 64‑bit int into two 32‑bit integers. +inline void toGPU(Context &ctx, const int64_t *data, WGPUBuffer buffer, + size_t numElements) { + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + int64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast((val >> 32) & 0xFFFFFFFF); + } + toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(int32_t)); +} + +// Overload for uint8_t: pack four 8‑bit uints into one 32‑bit unsigned integer. 
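// Illustrative sketch (hypothetical helpers, not part of the gpu.hpp API): the
// 4-bytes-per-word layout used by the 8-bit overloads in this file places byte i in
// word i / 4 at bit offset (i % 4) * 8, i.e. little-endian byte order within each
// word. <cstdint> and <vector> are already used throughout this header.
static inline std::vector<uint32_t> packBytesExample(const std::vector<uint8_t> &src) {
  std::vector<uint32_t> packed((src.size() + 3) / 4, 0);
  for (size_t i = 0; i < src.size(); ++i) {
    packed[i / 4] |= static_cast<uint32_t>(src[i]) << ((i % 4) * 8);
  }
  return packed;
}

static inline std::vector<uint8_t> unpackBytesExample(const std::vector<uint32_t> &packed,
                                                      size_t numElements) {
  std::vector<uint8_t> out(numElements);
  for (size_t i = 0; i < numElements; ++i) {
    out[i] = static_cast<uint8_t>((packed[i / 4] >> ((i % 4) * 8)) & 0xFFu);
  }
  return out;
}
// e.g. unpackBytesExample(packBytesExample({1, 2, 254, 255, 7}), 5) yields the original
// five bytes unchanged.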
+inline void toGPU(Context &ctx, const uint8_t *data, WGPUBuffer buffer, + size_t numElements) { + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + packed[idx] |= (static_cast(data[i]) << shift); + } + toGPU(ctx, packed.data(), buffer, packedCount * sizeof(uint32_t)); +} + +// Overload for uint16_t: pack two 16‑bit uints into one 32‑bit unsigned +// integer. +inline void toGPU(Context &ctx, const uint16_t *data, WGPUBuffer buffer, + size_t numElements) { + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + toGPU(ctx, packed.data(), buffer, packedCount * sizeof(uint32_t)); +} + +// Overload for uint64_t: pack each 64‑bit uint into two 32‑bit unsigned +// integers. +inline void toGPU(Context &ctx, const uint64_t *data, WGPUBuffer buffer, + size_t numElements) { + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(val >> 32); + } + toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(uint32_t)); +} + /** * @brief Overload of the toGPU function to copy data from CPU memory to a GPU * taking a Tensor instance instead of a WGPUBuffer instance. @@ -1680,22 +2187,105 @@ inline void toGPU(Context &ctx, const half *data, Tensor &tensor) { tensor.data.size); } -inline void toGPU(Context &ctx, const int *data, Tensor &tensor) { - wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, +// Overload for double: bit-pack each double into two 32‑bit unsigned integers. +inline void toGPU(Context &ctx, const double *data, Tensor &tensor) { + size_t numElements = tensor.data.size / sizeof(double); + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t bits; + std::memcpy(&bits, &data[i], + sizeof(double)); // Reinterpret double as raw bits. 
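    // Low word first, then high word -- the same order as the createTensor(double)
    // overload above, so both paths can be unpacked by the kf64 branch of toCPU.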
+ packed[2 * i] = static_cast(bits & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(bits >> 32); + } + toGPU(ctx, packed.data(), tensor.data.buffer, + packed.size() * sizeof(uint32_t)); +} + +// Overload for int8_t: pack four 8‑bit integers into one 32‑bit integer +inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + // Pack as unsigned then reinterpret (shader will unpack) + packed[idx] |= (static_cast(data[i]) << shift); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), tensor.data.size); } -inline void toGPU(Context &ctx, const float *data, Tensor &tensor, - size_t size) { - wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size); +// Overload for int16_t: pack two 16‑bit integers into one 32‑bit integer +inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + tensor.data.size); +} + +// Overload for int64_t: pack each 64‑bit integer into two 32‑bit integers +inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + int64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast((val >> 32) & 0xFFFFFFFF); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + tensor.data.size); } -inline void toGPU(Context &ctx, const half *data, Tensor &tensor, size_t size) { - wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size); +// Overload for uint8_t: pack four 8‑bit unsigned integers into one 32‑bit +// unsigned +inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + packed[idx] |= (static_cast(data[i]) << shift); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + tensor.data.size); } -inline void toGPU(Context &ctx, const int *data, Tensor &tensor, size_t size) { - wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size); +// Overload for uint16_t: pack two 16‑bit unsigned integers into one 32‑bit +// unsigned +inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + tensor.data.size); +} + +// Overload for uint64_t: pack each 64‑bit unsigned integer into two 32‑bit +// unsigned +inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + std::vector 
packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(val >> 32); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + tensor.data.size); } template diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index c183754..e6e8d71 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -214,13 +214,7 @@ fn main( } } )"; - Context ctx = createContext( - {}, {}, - /*device descriptor, enabling f16 in WGSL*/ - { - .requiredFeatureCount = 1, - .requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data(), - }); + Context ctx = createContext(); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -238,7 +232,7 @@ fn main( } } -int testHalfMain() { +int testHalf() { printf("\nHalf-precision float tests\n==========================\n"); printf("\nRegular values float round trips\n\n"); diff --git a/numeric_types/half.hpp b/numeric_types/half.hpp index f78e61a..395e257 100644 --- a/numeric_types/half.hpp +++ b/numeric_types/half.hpp @@ -7,59 +7,27 @@ #include #include -#ifdef _MSC_VER -#include - -static inline uint32_t __builtin_clz(uint32_t value) -{ - unsigned long leading_zero = 0; - if (value == 0) - { - return 32; +// Counts leading zeros in a 16-bit number. +static inline uint16_t half_clz16(uint16_t value) { + uint16_t count = 0; + // Start at the highest bit (0x8000) + for (uint16_t mask = 0x8000; mask; mask >>= 1) { + if (value & mask) + break; + ++count; } - _BitScanReverse(&leading_zero, value); - return 31 - leading_zero; + return count; } -static inline uint16_t __builtin_clz(uint16_t value) -{ - return __builtin_clz(static_cast(value)) - 16; -} - -static inline uint64_t __builtin_clz(uint64_t value) -{ - unsigned long leading_zero = 0; - if (value == 0) - { - return 64; - } -#if defined(_WIN64) - _BitScanReverse64(&leading_zero, value); - return 63 - leading_zero; -#else - uint32_t high = static_cast(value >> 32); - uint32_t low = static_cast(value); - if (high != 0) - { - return __builtin_clz(high); - } - else - { - return 32 + __builtin_clz(low); - } -#endif -} -#endif - struct half; static inline half halfFromFloat(float f); static inline float halfToFloat(half h); +int testHalf(); /** * Experimental implementation of half-precision 16-bit floating point numbers. */ -struct half -{ +struct half { uint16_t data; // Default constructor @@ -77,22 +45,19 @@ struct half operator uint16_t() const { return data; } // Overload assignment operator from uint16_t - half &operator=(uint16_t value) - { + half &operator=(uint16_t value) { data = value; return *this; } // Overload assignment operator from another half - half &operator=(const half &other) - { + half &operator=(const half &other) { data = other.data; return *this; } // Overload assignment operator from float - half &operator=(float value) - { + half &operator=(float value) { data = halfFromFloat(value); return *this; } @@ -103,10 +68,8 @@ struct half * * Based on Mike Acton's half.c implementation. 
*/ -half halfFromFloat(float f) -{ - union - { +half halfFromFloat(float f) { + union { float f; uint32_t u; } floatUnion = {f}; @@ -145,8 +108,7 @@ half halfFromFloat(float f) const uint32_t floatMantissa = float32 & FLOAT_MANTISSA_MASK; // Check for NaN - if ((floatExpMasked == FLOAT_EXP_MASK) && (floatMantissa != 0)) - { + if ((floatExpMasked == FLOAT_EXP_MASK) && (floatMantissa != 0)) { half result; result.data = HALF_EXP_MASK | (floatMantissa >> FLOAT_HALF_MANTISSA_POS_OFFSET); @@ -226,8 +188,7 @@ half halfFromFloat(float f) * * Based on Mike Acton's half.c implementation. */ -float halfToFloat(half h) -{ +float halfToFloat(half h) { // Constants for bit masks, shifts, and biases const uint16_t ONE = 0x0001; const uint16_t TWO = 0x0002; @@ -272,7 +233,7 @@ float halfToFloat(half h) const uint32_t isNan = isExpFlagged && isMantissaNonZero; // Handling denormalized numbers - const uint16_t halfMantissaLeadingZeros = __builtin_clz(halfMantissa) - 16; + const uint16_t halfMantissaLeadingZeros = half_clz16(halfMantissa); const uint16_t halfDenormShiftAmount = halfMantissaLeadingZeros + HALF_FLOAT_DENORM_SA_OFFSET; const uint32_t halfFloatDenormMantissaShiftAmount = @@ -308,8 +269,7 @@ float halfToFloat(half h) const uint32_t result = checkNanResult; // Reinterpret the uint32_t result as a float using a union - union - { + union { uint32_t u; float f; } floatUnion; diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index b855712..7f07dbf 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -1,4 +1,5 @@ #include "gpu.hpp" +#include "numeric_types/half.hpp" #include #include #include @@ -10,6 +11,114 @@ using namespace gpu; using namespace std::chrono; +// WGSL Kernels + +// Kernel to unpack 4x int8 (packed in i32) to 4x int32 +const char *kPackedInt8ToInt32Kernel = R"( + @group(0) @binding(0) var packed_input: array; + @group(0) @binding(1) var unpacked_output: array; + + // Function to sign-extend an 8-bit value (represented in the lower bits of an i32) + fn sign_extend_i8(val: i32) -> i32 { + return (val << 24) >> 24; + } + + @compute @workgroup_size({{workgroupSize}}) + fn main(@builtin(global_invocation_id) gid: vec3) { + let packed_idx: u32 = gid.x; + + // Check bounds for the PACKED input array + if (packed_idx >= arrayLength(&packed_input)) { + return; + } + + let packed_val = packed_input[packed_idx]; + + // Unpack and write 4 separate i32 values + // Ensure the output buffer is large enough (4x the packed size) + let base_output_idx = packed_idx * 4u; + + // Check bounds for the UNPACKED output array (optional but safer) + // This assumes arrayLength(&unpacked_output) is at least 4 * arrayLength(&packed_input) + if ((base_output_idx + 3u) >= arrayLength(&unpacked_output)) { + return; // Avoid out-of-bounds write if something is wrong + } + + unpacked_output[base_output_idx + 0u] = sign_extend_i8((packed_val >> 0u) & 0xFF); + unpacked_output[base_output_idx + 1u] = sign_extend_i8((packed_val >> 8u) & 0xFF); + unpacked_output[base_output_idx + 2u] = sign_extend_i8((packed_val >> 16u) & 0xFF); + unpacked_output[base_output_idx + 3u] = sign_extend_i8((packed_val >> 24u) & 0xFF); + } + )"; + +// Kernel to pack 4x int32 back into 1x int32 (taking lower 8 bits) +const char *kInt32ToPackedInt8Kernel = R"( + @group(0) @binding(0) var unpacked_input: array; + @group(0) @binding(1) var packed_output: array; + + @compute @workgroup_size({{workgroupSize}}) + fn main(@builtin(global_invocation_id) gid: vec3) { + let packed_idx: u32 = gid.x; // Index for the PACKED output array + + // Check 
bounds for the PACKED output array + if (packed_idx >= arrayLength(&packed_output)) { + return; + } + + let base_input_idx = packed_idx * 4u; + + // Check bounds for the UNPACKED input array (optional but safer) + // Assumes arrayLength(&unpacked_input) is at least 4 * arrayLength(&packed_output) + if ((base_input_idx + 3u) >= arrayLength(&unpacked_input)) { + // Handle potential error or incomplete data - maybe write 0? + packed_output[packed_idx] = 0; + return; + } + + // Read 4 separate i32 values + let val0 = unpacked_input[base_input_idx + 0u]; + let val1 = unpacked_input[base_input_idx + 1u]; + let val2 = unpacked_input[base_input_idx + 2u]; + let val3 = unpacked_input[base_input_idx + 3u]; + + // Pack the lower 8 bits of each into one i32 + var packed_result: i32 = 0; + packed_result = packed_result | ((val0 & 0xFF) << 0u); + packed_result = packed_result | ((val1 & 0xFF) << 8u); + packed_result = packed_result | ((val2 & 0xFF) << 16u); + packed_result = packed_result | ((val3 & 0xFF) << 24u); + + packed_output[packed_idx] = packed_result; + } + )"; + +// Simple addition kernel for i32 +const char *kSimpleAddKernelI32 = R"( + @group(0) @binding(0) var a: array<{{precision}}>; + @group(0) @binding(1) var b: array<{{precision}}>; + @group(0) @binding(2) var c: array<{{precision}}>; + + @compute @workgroup_size({{workgroupSize}}) + fn main(@builtin(global_invocation_id) gid: vec3) { + let i: u32 = gid.x; + if (i < arrayLength(&a)) { + c[i] = a[i] + b[i]; + } + } + )"; + +// A simple WGSL copy kernel that copies input to output. +static const char *kCopyKernel = R"( + @group(0) @binding(0) var inp: array<{{precision}}>; + @group(0) @binding(1) var out: array<{{precision}}>; + @compute @workgroup_size({{workgroupSize}}) + fn main(@builtin(global_invocation_id) gid: vec3) { + let i: u32 = gid.x; + if (i < arrayLength(&inp)) { + out[i] = inp[i]; + } + } + )"; // Forward declarations: void testToCPUWithTensor(); @@ -17,33 +126,649 @@ void testToCPUWithBuffer(); void testToCPUWithTensorSourceOffset(); void testToCPUWithBufferSourceOffset(); void stressTestToCPU(); +void testToCPUWithHalf(); +void testToCPUWithFloat(); +void testToCPUWithDouble(); +void testToCPUWithint8(); +void testToCPUWithint16(); +void testToCPUWithint(); +void testToCPUWithint64(); +void testToCPUWithUint8(); +void testToCPUWithUint16(); +void testToCPUWithUint32(); +void testToCPUWithUint64(); +void testNumTypeSizes(); +void testToCPUUnpack(); +void testCopyShaderPackedUnpack_int8(); +void testAddKernelInt8(); int main() { LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testAddKernelInt8(); + testCopyShaderPackedUnpack_int8(); + testToCPUUnpack(); testToCPUWithTensor(); testToCPUWithBuffer(); testToCPUWithTensorSourceOffset(); testToCPUWithBufferSourceOffset(); + testToCPUWithHalf(); + testToCPUWithFloat(); + testToCPUWithDouble(); + testToCPUWithint8(); + testToCPUWithint16(); + testToCPUWithint(); + testToCPUWithint64(); + testToCPUWithUint8(); + testToCPUWithUint16(); + testToCPUWithUint32(); + testToCPUWithUint64(); + testNumTypeSizes(); stressTestToCPU(); + testHalf(); LOG(kDefLog, kInfo, "All tests passed."); return 0; } +void testAddKernelInt8() { + LOG(kDefLog, kInfo, "Running testAddKernelInt8 (with conversion kernels)..."); -// A simple WGSL copy kernel that copies input to output. 
-static const char *kCopyKernel = R"( -@group(0) @binding(0) var inp: array<{{precision}}>; -@group(0) @binding(1) var out: array<{{precision}}>; -@group(0) @binding(1) var dummy: array<{{precision}}>; -@compute @workgroup_size({{workgroupSize}}) -fn main(@builtin(global_invocation_id) gid: vec3) { - let i: u32 = gid.x; - if (i < arrayLength(&inp)) { - out[i] = inp[i]; +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; // Logical number of int8 elements + std::vector aInput(N), bInput(N), result(N); + std::vector expected(N); + + // CPU Data Setup + for (size_t i = 0; i < N; ++i) { + // Values in range [-10, 9] + aInput[i] = static_cast((i % 20) - 10); + bInput[i] = static_cast(((2 * i) % 20) - 10); + // Compute expected as int then cast back. + int temp = static_cast(aInput[i]) + static_cast(bInput[i]); + expected[i] = static_cast(temp); + result[i] = 0; + } + + // These store the int8 data packed into i32 format on the GPU + Tensor aTensorPacked = createTensor(ctx, Shape{N}, ki8, aInput.data()); + Tensor bTensorPacked = createTensor(ctx, Shape{N}, ki8, bInput.data()); + // Final output tensor, also in packed format + Tensor outputTensorPacked = createTensor(ctx, Shape{N}, ki8); + + // These will hold the data converted to one i32 per original int8 element + Tensor aTensorUnpacked = createTensor(ctx, Shape{N}, ki32); + Tensor bTensorUnpacked = createTensor(ctx, Shape{N}, ki32); + Tensor outputTensorUnpacked = + createTensor(ctx, Shape{N}, ki32); // For the simple add result + + constexpr uint32_t workgroupSize = 256; + size_t packedCount = (N + 3) / 4; // Number of i32 elements in packed buffers + size_t unpackedCount = N; // Number of i32 elements in unpacked buffers + + // Convert Packed Inputs to Unpacked i32 + Kernel unpackKernelA = + createKernel(ctx, {kPackedInt8ToInt32Kernel, workgroupSize, ki32}, + Bindings{aTensorPacked, aTensorUnpacked}, + {cdiv(packedCount, workgroupSize), 1, + 1}); // Dispatch based on packed size + Kernel unpackKernelB = + createKernel(ctx, {kPackedInt8ToInt32Kernel, workgroupSize, ki32}, + Bindings{bTensorPacked, bTensorUnpacked}, + {cdiv(packedCount, workgroupSize), 1, 1}); + // Dispatch based on packed size + dispatchKernel(ctx, unpackKernelA); + dispatchKernel(ctx, unpackKernelB); + + // Perform Simple Addition on Unpacked i32 + Kernel simpleAddKernel = createKernel( + ctx, {kSimpleAddKernelI32, workgroupSize, ki32}, + Bindings{aTensorUnpacked, bTensorUnpacked, outputTensorUnpacked}, + {cdiv(unpackedCount, workgroupSize), 1, + 1}); // Dispatch based on unpacked size + dispatchKernel(ctx, simpleAddKernel); + + // Convert Unpacked i32 Result back to Packed + Kernel packKernel = + createKernel(ctx, {kInt32ToPackedInt8Kernel, workgroupSize, ki32}, + Bindings{outputTensorUnpacked, outputTensorPacked}, + {cdiv(packedCount, workgroupSize), 1, + 1}); // Dispatch based on packed size + dispatchKernel(ctx, packKernel); + + // Copy Final Packed Result to CPU and Unpack + // Use the original toCPU for ki8, which handles the final CPU-side unpacking + toCPU(ctx, outputTensorPacked, ki8, result.data(), 0); + + for (size_t i = 0; i < N; ++i) { + assert(result[i] == expected[i]); + } + + LOG(kDefLog, kInfo, "testAddKernelInt8 (with conversion kernels) passed."); +} + +void testCopyShaderPackedUnpack_int8() { + LOG(kDefLog, kInfo, "Running testCopyShaderPackedUnpack_int8..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); 
+#endif + + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + // Values between -128 and 127. + inputData[i] = static_cast((i % 256) - 128); + } + + // Create an input tensor using the int8_t overload. + // Under the hood the data is packed into int32_t. + Tensor inputTensor = createTensor(ctx, Shape{N}, ki8, inputData.data()); + + // Create an output tensor of the same shape and unsupported type. + Tensor outputTensor = createTensor(ctx, Shape{N}, ki8); + + // Our copy shader (kCopyKernel) expects to work with supported types. + // Since int8_t is packed into int32_t, we pass 'ki32' as our shader + // precision. + Kernel copyKernel = + createKernel(ctx, {kCopyKernel, 256, ki32}, + Bindings{inputTensor, outputTensor}, {cdiv(N, 256), 1, 1}); + dispatchKernel(ctx, copyKernel); + + // Now retrieve the output from the GPU and unpack from the packed int32_t + // back to int8_t. + toCPU(ctx, outputTensor, ki8, outputData.data(), 0); + + // Verify the unpacked data matches the original input. + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "testCopyShaderPackedUnpack_int8 passed."); +} + +void testToCPUUnpack() { + LOG(kDefLog, kInfo, "Running testToCPUUnpack..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + // Test for double (kf64 -> packed as kf32) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i) * 3.14; + } + Tensor tensor = createTensor(ctx, Shape{N}, kf64, inputData.data()); + toCPU(ctx, tensor, kf64, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + // Allow for a very small epsilon error due to float conversion. 
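      // (Strictly, the kf64 path round-trips the raw 64-bit pattern, so exact equality
      // should also hold; the epsilon is just defensive.)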
+ assert(fabs(inputData[i] - outputData[i]) < 1e-4); + } + LOG(kDefLog, kInfo, "toCPUUnpack for double passed."); + } + + // Test for int8_t (ki8 -> packed as ki32) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast((i % 256) - 128); + } + Tensor tensor = createTensor(ctx, Shape{N}, ki8, inputData.data()); + toCPU(ctx, tensor, ki8, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for int8_t passed."); + } + + // Test for int16_t (ki16 -> packed as ki32) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast((i % 65536) - 32768); + } + Tensor tensor = createTensor(ctx, Shape{N}, ki16, inputData.data()); + toCPU(ctx, tensor, ki16, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for int16_t passed."); + } + + // Test for int64_t (ki64 -> packed as two ki32s) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i) - 512; + } + Tensor tensor = createTensor(ctx, Shape{N}, ki64, inputData.data()); + toCPU(ctx, tensor, ki64, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for int64_t passed."); + } + + // Test for uint8_t (ku8 -> packed as ku32) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i % 256); + } + Tensor tensor = createTensor(ctx, Shape{N}, ku8, inputData.data()); + toCPU(ctx, tensor, ku8, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for uint8_t passed."); + } + + // Test for uint16_t (ku16 -> packed as ku32) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i % 65536); + } + Tensor tensor = createTensor(ctx, Shape{N}, ku16, inputData.data()); + toCPU(ctx, tensor, ku16, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for uint16_t passed."); + } + + // Test for uint64_t (ku64 -> packed as two ku32s) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i) * 123456789ULL; + } + Tensor tensor = createTensor(ctx, Shape{N}, ku64, inputData.data()); + toCPU(ctx, tensor, ku64, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for uint64_t passed."); + } + + LOG(kDefLog, kInfo, "All toCPUUnpack tests passed."); +} + +void testNumTypeSizes() { + LOG(kDefLog, kInfo, "Running testNumTypeSizes..."); + + assert(sizeBytes(kf16) == 2); + assert(sizeBytes(kf32) == 4); + assert(sizeBytes(ki8) == sizeof(uint8_t)); // typically 1 + assert(sizeBytes(ki16) == sizeof(uint16_t)); // typically 2 + assert(sizeBytes(ki32) == sizeof(int32_t)); // typically 4 + assert(sizeBytes(ku8) == sizeof(uint8_t)); // typically 1 + assert(sizeBytes(ku16) == sizeof(uint16_t)); // typically 2 + assert(sizeBytes(ku32) == sizeof(uint32_t)); // typically 4 + + LOG(kDefLog, kInfo, 
"testNumTypeSizes passed."); +} + +// Test using half-precision (16-bit float) data. +void testToCPUWithHalf() { + LOG(kDefLog, kInfo, "Running testToCPUWithHalf..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + // Construct half from float. + inputData[i] = half(static_cast(i)); + } + + Tensor inputTensor = createTensor(ctx, Shape{N}, kf16, inputData.data()); + + // Copy GPU output to CPU. + toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData)); + + // Validate the copy (using float conversion for approximate equality). + for (size_t i = 0; i < N; ++i) { + float inVal = static_cast(inputData[i]); + float outVal = static_cast(outputData[i]); + // Use a small epsilon to compare half values. + assert(fabs(inVal - outVal) <= 0.01f); + } + LOG(kDefLog, kInfo, "testToCPUWithHalf passed."); +} + +// Test using float (32-bit) data. +void testToCPUWithFloat() { + LOG(kDefLog, kInfo, "Running testToCPUWithFloat..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i * 1.5f); + outputData[i] = 0.0f; + } + + Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + + // Copy GPU output to CPU. + toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData)); + + // Validate the copy. + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithFloat passed."); +} + +// Test using double (64-bit floating point) data. +void testToCPUWithDouble() { + LOG(kDefLog, kInfo, "Running testToCPUWithDouble..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i) * 2.5; + outputData[i] = 0.0; + } + + Tensor inputTensor = createTensor(ctx, Shape{N}, kf64, inputData.data()); + + // Copy GPU output to CPU. + toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData)); + + // Validate the copy. + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithDouble passed."); +} + +void testToCPUWithint8() { + LOG(kDefLog, kInfo, "Running testToCPUWithint8..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + // Use a range that includes negative values. + for (size_t i = 0; i < N; ++i) { + // Values between -128 and 127. + inputData[i] = static_cast((i % 256) - 128); + outputData[i] = 0; + } + + // Create a tensor for int8_t. + Tensor inputTensor = createTensor(ctx, Shape{N}, ki8, inputData.data()); + + // Synchronously copy the GPU tensor data to CPU. + toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData)); + + // Validate the copy. + for (size_t i = 0; i < N; ++i) { + // LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]); + // LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]); + assert(outputData[i] == inputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithint8 passed."); +} + +// Test using int16_t data. 
+void testToCPUWithint16() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithint16...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<int16_t, N> inputData, outputData;
+  // Use a range that includes negative values.
+  for (size_t i = 0; i < N; ++i) {
+    // Values between -32768 and 32767.
+    inputData[i] = static_cast<int16_t>((i % 65536) - 32768);
+    outputData[i] = 0;
+  }
+
+  // Create a tensor for int16_t.
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ki16, inputData.data());
+
+  // Synchronously copy the GPU tensor data to CPU.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Validate the copy.
+  for (size_t i = 0; i < N; ++i) {
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]);
+    assert(outputData[i] == inputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithint16 passed.");
+}
+
+// Test using int (int32_t) data.
+void testToCPUWithint() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithint...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<int32_t, N> inputData, outputData;
+  // Fill with sample data.
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] =
+        static_cast<int32_t>(i) - 512; // Negative and positive values.
+    outputData[i] = 0;
+  }
+
+  // Create a tensor for int32_t.
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ki32, inputData.data());
+
+  // Synchronously copy the GPU tensor data to CPU.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Validate the copy.
+  for (size_t i = 0; i < N; ++i) {
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]);
+    assert(outputData[i] == inputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithint passed.");
+}
+
+// Test using int64_t (64-bit signed integer) data.
+void testToCPUWithint64() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithint64...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<int64_t, N> inputData, outputData;
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] =
+        static_cast<int64_t>(i) - 512; // Some negative and positive values.
+    outputData[i] = 0;
+  }
+
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ki64, inputData.data());
+
+  // Copy GPU output to CPU.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Validate the copy.
+  for (size_t i = 0; i < N; ++i) {
+    assert(inputData[i] == outputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithint64 passed.");
+}
+
+void testToCPUWithUint8() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithUint8...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<uint8_t, N> inputData, outputData;
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] = static_cast<uint8_t>(i % 256);
+    outputData[i] = 0;
+  }
+
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ku8, inputData.data());
+
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Verify the output matches the input.
+  for (size_t i = 0; i < N; ++i) {
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
+    assert(outputData[i] == inputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithUint8 passed.");
 }
-)";
+void testToCPUWithUint16() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithUint16...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<uint16_t, N> inputData, outputData;
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] = static_cast<uint16_t>(i % 65536);
+    outputData[i] = 0;
+  }
+
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ku16, inputData.data());
+
+  // Synchronously copy GPU output to CPU using the tensor overload.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Verify the output matches the input.
+  for (size_t i = 0; i < N; ++i) {
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
+    assert(outputData[i] == inputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithUint16 passed.");
+}
+
+void testToCPUWithUint32() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithUint32...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<uint32_t, N> inputData, outputData;
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] = static_cast<uint32_t>(i);
+    outputData[i] = 0;
+  }
+
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ku32, inputData.data());
+
+  // Synchronously copy GPU output to CPU using the tensor overload.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Verify the output matches the input.
+  for (size_t i = 0; i < N; ++i) {
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
+    assert(outputData[i] == inputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithUint32 passed.");
+}
+
+// Test using uint64_t (64-bit unsigned integer) data.
+void testToCPUWithUint64() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithUint64...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<uint64_t, N> inputData, outputData;
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] = static_cast<uint64_t>(i);
+    outputData[i] = 0;
+  }
+
+  // Assuming a new NumType 'ku64' for 64-bit unsigned integers.
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ku64, inputData.data());
+
+  // Copy GPU output to CPU.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Validate the copy.
+  for (size_t i = 0; i < N; ++i) {
+    assert(inputData[i] == outputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithUint64 passed.");
+}
 
 // Test using the overload that takes a Tensor.
 void testToCPUWithTensor() {
@@ -78,8 +803,8 @@ void testToCPUWithTensor() {
 
   // Verify the output matches the input.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]);
-    LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
     assert(outputData[i] == inputData[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithTensor passed.");
@@ -116,7 +841,7 @@ void testToCPUWithBuffer() {
 
   // Verify that the CPU output matches the original data.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
     assert(outputData[i] == data[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithBuffer passed.");
@@ -158,8 +883,8 @@ void testToCPUWithTensorSourceOffset() {
   for (size_t i = 0; i < copyCount; ++i) {
     float expected = inputData[sourceOffsetElements + i];
     float actual = cpuOutput[i];
-    LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
-    LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
+    // LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
+    // LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
     assert(expected == actual);
   }
   LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed.");
@@ -201,8 +926,8 @@ void testToCPUWithBufferSourceOffset() {
   for (size_t i = 0; i < copyCount; ++i) {
     float expected = inputData[sourceOffsetElements + i];
     float actual = cpuOutput[i];
-    LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
-    LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
+    // LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
+    // LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
     assert(expected == actual);
   }
   LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed.");
@@ -229,22 +954,24 @@ void stressTestToCPU() {
   auto startTime = high_resolution_clock::now();
   size_t opCount = 0;
   while (high_resolution_clock::now() - startTime < seconds(2)) {
-    // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes)
+    // Allocate an output buffer (using a shared_ptr so it stays valid until the
+    // future completes)
     auto outputData = std::make_shared<std::vector<float>>(N, 0.0f);
 
     // Use the tensor overload; we’re copying the entire tensor (destOffset = 0)
-    LOG(kDefLog, kInfo, "Copying %zu bytes from GPU to CPU...", N * sizeof(float)); // log count
-    LOG(kDefLog, kInfo, "opCount = %zu", opCount);
-    auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0);
+    auto fut =
+        toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0);
     wait(ctx, fut);
     ++opCount;
   }
-  
+
   auto endTime = high_resolution_clock::now();
   auto totalMs = duration_cast<milliseconds>(endTime - startTime).count();
   double throughput = (opCount / (totalMs / 1000.0));
-  LOG(kDefLog, kInfo, "Stress test completed:\n"
-      " %zu GPU to CPU operations in %lld ms\n"
-      " Throughput: %.2f ops/sec", opCount, totalMs, throughput);
+  LOG(kDefLog, kInfo,
+      "Stress test completed:\n"
+      " %zu GPU to CPU operations in %lld ms\n"
+      " Throughput: %.2f ops/sec",
+      opCount, totalMs, throughput);
 }