diff --git a/cmake/dawn.cmake b/cmake/dawn.cmake index c6fed94..baed5ad 100644 --- a/cmake/dawn.cmake +++ b/cmake/dawn.cmake @@ -1,124 +1,167 @@ -# Setup directories -set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT}/third_party") -set(DAWN_DIR "${FETCHCONTENT_BASE_DIR}/dawn" CACHE INTERNAL "") -set(DAWN_BUILD_DIR "${DAWN_DIR}/build" CACHE INTERNAL "") +cmake_minimum_required(VERSION 3.14) +include(ExternalProject) +include(FetchContent) + +# include("${CMAKE_CURRENT_SOURCE_DIR}/cmake/print_target.cmake") + + +# Setup directories and basic paths +set(FETCHCONTENT_BASE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/external") +set(DAWN_DIR "${FETCHCONTENT_BASE_DIR}/dawn" CACHE INTERNAL "Dawn source directory") + +# For Emscripten builds (if desired) +set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") +set(EMSCRIPTEN_DIR "${EM_SDK_DIR}/upstream/emscripten" CACHE INTERNAL "") + +# Decide where to build Dawn’s build files. if(EMSCRIPTEN) - set(EM_SDK_DIR $ENV{EMSDK} CACHE INTERNAL "") - set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "") - set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EM_SDK_DIR}/upstream/emscripten CACHE INTERNAL "" FORCE) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_web" CACHE INTERNAL "web build directory" FORCE) +elseif(WIN32) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_win" CACHE INTERNAL "windows build directory" FORCE) +elseif(IOS) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_ios" CACHE INTERNAL "ios build directory" FORCE) +elseif(APPLE) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_mac" CACHE INTERNAL "mac build directory" FORCE) +elseif(ANDROID) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_android" CACHE INTERNAL "android build directory" FORCE) else() - add_compile_definitions(USE_DAWN_API) + set(DAWN_BUILD_DIR "${DAWN_DIR}/build_unix" CACHE INTERNAL "linux build directory" FORCE) endif() -# Enable find for no dawn rebuilds with flutter run -set(ENABLE_DAWN_FIND OFF CACHE BOOL "Enable finding Dawn" FORCE) +# Add Dawn header include directories so that they are available later. +include_directories(BEFORE PUBLIC + "${DAWN_BUILD_DIR}/src/dawn/native/" + "${DAWN_BUILD_DIR}/src/dawn/native/Debug" + "${DAWN_BUILD_DIR}/src/dawn/native/Release" +) + + +# Optionally try to find an existing Dawn build. +set(ENABLE_DAWN_FIND OFF CACHE BOOL "Attempt to find an existing Dawn build" FORCE) set(DAWN_BUILD_FOUND OFF CACHE BOOL "Dawn build found" FORCE) + if(ENABLE_DAWN_FIND) - # find_library, windows adds extra folder - if(MSVC) - find_library(WEBGPU_DAWN_DEBUG webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" - ) - find_library(WEBGPU_DAWN_RELEASE webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" - ) - set(DAWN_BUILD_FOUND ON) - elseif(NOT EMSCRIPTEN AND NOT MSVC) - find_library(WEBGPU_DAWN_LIB - NAMES webgpu_dawn - PATHS "${DAWN_BUILD_DIR}/src/dawn/native" - REQUIRED - ) - set(DAWN_BUILD_FOUND ON) - else() - set(DAWN_BUILD_FOUND ON) + message(STATUS "Attempting to find an existing Dawn build...") + if(WIN32) + find_library(WEBGPU_DAWN_DEBUG NAMES webgpu_dawn HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug") + find_library(WEBGPU_DAWN_RELEASE NAMES webgpu_dawn HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release") + + if(WEBGPU_DAWN_DEBUG OR WEBGPU_DAWN_RELEASE) + message(STATUS "Dawn build found on Windows. 
Debug: ${WEBGPU_DAWN_DEBUG}, Release: ${WEBGPU_DAWN_RELEASE}") + set(DAWN_BUILD_FOUND ON) + endif() + elseif(NOT EMSCRIPTEN AND NOT WIN32) + find_library(WEBGPU_DAWN_LIB NAMES webgpu_dawn.so PATHS "${DAWN_BUILD_DIR}/src/dawn/native") + + if(WEBGPU_DAWN_LIB) + message(STATUS "Dawn build found on Linux/Unix. Library: ${WEBGPU_DAWN_LIB}") + set(DAWN_BUILD_FOUND ON) endif() + endif() endif() -# Dawn options for more, -# see https://dawn.googlesource.com/dawn/+/refs/heads/main/CMakeLists.txt -set(DAWN_ALWAYS_ASSERT OFF CACHE INTERNAL "Always assert in Dawn" FORCE) -set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) -set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) -set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) -set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) -set(DAWN_ENABLE_INSTALL OFF CACHE INTERNAL "Enable Dawn installation" FORCE) -set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) -set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) -set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) -set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) +# Pre-build Dawn at configuration time if not already built. if(NOT DAWN_BUILD_FOUND) - include(FetchContent) - message("webgpu_dawn not found start building") - if(EMSCRIPTEN) - set(EMSCRIPTEN_DIR "${EM_SDK_DIR}/upstream/emscripten" CACHE INTERNAL "" FORCE) - set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EMSCRIPTEN_DIR} CACHE INTERNAL "" FORCE) - endif() + message(STATUS "Dawn build not found - pre-building Dawn.") - FetchContent_Declare( - dawn - DOWNLOAD_DIR ${DAWN_DIR} - SOURCE_DIR ${DAWN_DIR} - SUBBUILD_DIR ${DAWN_BUILD_DIR}/tmp - BINARY_DIR ${DAWN_BUILD_DIR} - DOWNLOAD_COMMAND - cd ${DAWN_DIR} && - git init && - git fetch --depth=1 https://dawn.googlesource.com/dawn && - git reset --hard FETCH_HEAD - ) + # Force Dawn build options. + set(DAWN_ALWAYS_ASSERT ON CACHE INTERNAL "Always assert in Dawn" FORCE) + set(DAWN_BUILD_MONOLITHIC_LIBRARY ON CACHE INTERNAL "Build Dawn monolithically" FORCE) + set(DAWN_BUILD_EXAMPLES OFF CACHE INTERNAL "Build Dawn examples" FORCE) + set(DAWN_BUILD_SAMPLES OFF CACHE INTERNAL "Build Dawn samples" FORCE) + set(DAWN_BUILD_TESTS OFF CACHE INTERNAL "Build Dawn tests" FORCE) + set(DAWN_ENABLE_INSTALL OFF CACHE INTERNAL "Enable Dawn installation" FORCE) + set(DAWN_FETCH_DEPENDENCIES ON CACHE INTERNAL "Fetch Dawn dependencies" FORCE) + set(TINT_BUILD_TESTS OFF CACHE INTERNAL "Build Tint Tests" FORCE) + set(TINT_BUILD_IR_BINARY OFF CACHE INTERNAL "Build Tint IR binary" FORCE) + set(TINT_BUILD_CMD_TOOLS OFF CACHE INTERNAL "Build Tint command line tools" FORCE) + set(DAWN_EMSCRIPTEN_TOOLCHAIN ${EMSCRIPTEN_DIR} CACHE INTERNAL "Emscripten toolchain" FORCE) - # Download the repository and add it as a subdirectory. - FetchContent_MakeAvailable(dawn) + set(DAWN_COMMIT "66d57f910357befb441b91162f29a97f687af6d9" CACHE STRING "Dawn commit to checkout" FORCE) + + file(MAKE_DIRECTORY ${DAWN_DIR}) + # Initialize Git and set/update remote. + execute_process(COMMAND git init + WORKING_DIRECTORY "${DAWN_DIR}" + ) + execute_process( + COMMAND git remote add origin https://dawn.googlesource.com/dawn + WORKING_DIRECTORY "${DAWN_DIR}" + ) + # Fetch and checkout the specified commit. 
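  # Note (assumption, not re-verified on a clean checkout): these git steps are meant to
  # be safe to re-run on every configure -- `git init` and `git remote add` may only warn
  # once the repository already exists, and the fetch/checkout below simply re-pin the
  # tree to DAWN_COMMIT so that FetchContent_MakeAvailable builds a reproducible revision.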
+ execute_process( + COMMAND git fetch origin ${DAWN_COMMIT} + WORKING_DIRECTORY "${DAWN_DIR}" + ) + execute_process( + COMMAND git checkout ${DAWN_COMMIT} + WORKING_DIRECTORY "${DAWN_DIR}" + ) + execute_process( + COMMAND git reset --hard ${DAWN_COMMIT} + WORKING_DIRECTORY "${DAWN_DIR}" + ) + # Fetch the Dawn repository if not already present. + FetchContent_Declare( + dawn + SOURCE_DIR ${DAWN_DIR} + SUBBUILD_DIR ${DAWN_BUILD_DIR}/tmp + BINARY_DIR ${DAWN_BUILD_DIR} + ) + FetchContent_MakeAvailable(dawn) - # attempt fix flutter rebuilds - set(CMAKE_INCLUDE_PATH "${CMAKE_INCLUDE_PATH};${DAWN_DIR}/src" CACHE INTERNAL "") + set(CMAKE_INCLUDE_PATH "${CMAKE_INCLUDE_PATH};${DAWN_DIR}/src" CACHE INTERNAL "") - execute_process( - WORKING_DIRECTORY ${DAWN_DIR} - COMMAND ${CMAKE_COMMAND} -S ${DAWN_DIR} - -B ${DAWN_BUILD_DIR} - ) + set(DAWN_BUILD_FOUND ON) +endif() # End pre-build Dawn - # Build Dawn - execute_process( - COMMAND ${CMAKE_COMMAND} --build ${DAWN_BUILD_DIR} - ) - - # find_library, windows adds extra folder - if(MSVC) - find_library(WEBGPU_DAWN_DEBUG webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Debug" - ) - find_library(WEBGPU_DAWN_RELEASE webgpu_dawn - NAMES webgpu_dawn - HINTS "${DAWN_BUILD_DIR}/src/dawn/native/Release" - ) - set(DAWN_BUILD_FOUND ON) - elseif(NOT EMSCRIPTEN AND NOT MSVC) - find_library(WEBGPU_DAWN_LIB - NAMES webgpu_dawn - PATHS "${DAWN_BUILD_DIR}/src/dawn/native" - REQUIRED - ) - set(DAWN_BUILD_FOUND ON) - else() - set(DAWN_BUILD_FOUND ON) - endif() +# Create an IMPORTED target for the Dawn library. +# Adjust the expected output name/extension per platform. +if(MSVC) +message(STATUS "Dawn build found on Windows.") +# MSVC: use separate debug and release dlls. +if((NOT WEBGPU_DAWN_DEBUG) OR (WEBGPU_DAWN_DEBUG MATCHES "NOTFOUND")) + find_library(WEBGPU_DAWN_DEBUG NAMES webgpu_dawn PATHS "${DAWN_BUILD_DIR}/src/dawn/native/Debug") +endif() +if((NOT WEBGPU_DAWN_RELEASE) OR (WEBGPU_DAWN_RELEASE MATCHES "NOTFOUND")) + find_library(WEBGPU_DAWN_RELEASE NAMES webgpu_dawn PATHS "${DAWN_BUILD_DIR}/src/dawn/native/Release") endif() -if(EMSCRIPTEN) - add_library(webgpu_dawn INTERFACE IMPORTED) - target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include) - target_include_directories(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/include/webgpu/webgpu.h) - target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_enum_tables.js) - target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_struct_info.js) - target_link_libraries(webgpu_dawn INTERFACE ${DAWN_BUILD_DIR}/gen/src/emdawnwebgpu/library_webgpu_generated_sig_info.js) - target_link_libraries(webgpu_dawn INTERFACE ${DAWN_DIR}/third_party/emdawnwebgpu/library_webgpu.js) -else() +if(WEBGPU_DAWN_DEBUG OR WEBGPU_DAWN_RELEASE) + if(NOT TARGET webgpu_dawn) + add_library(webgpu_dawn INTERFACE) + target_link_libraries(webgpu_dawn INTERFACE + $<$:${WEBGPU_DAWN_DEBUG}> + $<$:${WEBGPU_DAWN_RELEASE}> + ) + endif() endif() +elseif(IOS) + # On iOS, it is common to build a static library. + if(NOT TARGET webgpu_dawn) + add_library(webgpu_dawn STATIC IMPORTED) + set_target_properties(webgpu_dawn PROPERTIES + IMPORTED_LOCATION "${DAWN_BUILD_DIR}/src/dawn/native/webgpu_dawn.a") + endif() +elseif(APPLE) + # On macOS (non-iOS), typically a dynamic library (.dylib) is built. 
+ if(NOT TARGET webgpu_dawn) + add_library(webgpu_dawn SHARED IMPORTED) + set_target_properties(webgpu_dawn PROPERTIES + IMPORTED_LOCATION "${DAWN_BUILD_DIR}/src/dawn/native/webgpu_dawn.dylib") + endif() +elseif(ANDROID) + if(NOT TARGET webgpu_dawn) + add_library(webgpu_dawn SHARED IMPORTED) + set_target_properties(webgpu_dawn PROPERTIES + IMPORTED_LOCATION "${DAWN_BUILD_DIR}/src/dawn/native/webgpu_dawn.so") + endif() +elseif(NOT EMSCRIPTEN) # For Linux and other Unix-like systems. + if(NOT TARGET webgpu_dawn) + add_library(webgpu_dawn SHARED IMPORTED) + set_target_properties(webgpu_dawn PROPERTIES + IMPORTED_LOCATION "${DAWN_BUILD_DIR}/src/dawn/native/webgpu_dawn.so") + endif() +endif() \ No newline at end of file diff --git a/cmake/gpu.cmake b/cmake/gpu.cmake index d991a18..d57f083 100644 --- a/cmake/gpu.cmake +++ b/cmake/gpu.cmake @@ -15,7 +15,6 @@ message(STATUS "PROJECT_ROOT: ${PROJECT_ROOT}") set(GPU_SOURCES "${PROJECT_ROOT}/gpu.cpp" "${PROJECT_ROOT}/numeric_types/half.cpp" - "${DAWN_BUILD_DIR}/gen/include/dawn/webgpu.h" ) # Add headers diff --git a/gpu.hpp b/gpu.hpp index 69ed0e9..4b2afee 100644 --- a/gpu.hpp +++ b/gpu.hpp @@ -195,7 +195,16 @@ struct TensorPool { enum NumType { kf16, // (experimental) kf32, - ki32 + kf64, + ki8, + ki16, + ki32, + ki64, + ku8, + ku16, + ku32, + ku64, + kUnknown }; /** @@ -204,11 +213,27 @@ enum NumType { inline size_t sizeBytes(const NumType &type) { switch (type) { case kf16: - return sizeof(uint16_t); + return sizeof(half); case kf32: return sizeof(float); + case kf64: + return sizeof(double); + case ki8: + return sizeof(int8_t); + case ki16: + return sizeof(int16_t); case ki32: return sizeof(int32_t); + case ki64: + return sizeof(int64_t); + case ku8: + return sizeof(uint8_t); + case ku16: + return sizeof(uint16_t); + case ku32: + return sizeof(uint32_t); + case ku64: + return sizeof(uint64_t); default: LOG(kDefLog, kError, "Invalid NumType in size calculation."); return 0; @@ -224,8 +249,24 @@ inline std::string toString(NumType type) { return "f16"; case kf32: return "f32"; + case kf64: + return "f64"; + case ki8: + return "i8"; + case ki16: + return "i16"; case ki32: return "i32"; + case ki64: + return "i64"; + case ku8: + return "u8"; + case ku16: + return "u16"; + case ku32: + return "u32"; + case ku64: + return "u64"; default: LOG(kDefLog, kError, "Invalid NumType in string conversion."); return "unknown"; @@ -693,6 +734,18 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype) { * Tensor tensor = createTensor(ctx, {256, 256}, kf32, data); * @endcode */ +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const half *data) { + assert(dtype == kf16); + Tensor tensor = + createTensor(ctx.pool, ctx.device, shape, dtype, + WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc); + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, + tensor.data.size); + return tensor; +} + inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, const float *data) { assert(dtype == kf32); @@ -705,6 +758,32 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, return tensor; } +// Overload for double: pack each double into a float (losing precision) +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const double *data) { + assert(dtype == kf64); + size_t numElements = size(shape); + // Each double (8 bytes) will be packed into 2 uint32_t values (2×4 bytes). 
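  // Clarification: despite the "losing precision" note above, this is a raw bit copy,
  // not a float conversion. The low 32 bits of each double go to packed[2 * i] and the
  // high 32 bits to packed[2 * i + 1], so the value survives the round trip exactly;
  // the matching toGPU/toCPU overloads below reassemble the same word order.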
+ std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t bits; + std::memcpy(&bits, &data[i], sizeof(double)); // Extract raw bits. + packed[2 * i] = static_cast(bits & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(bits >> 32); + } + // Create a tensor using the core overload that accepts a TensorPool and + // WGPUDevice. + Tensor tensor = + createTensor(ctx.pool, ctx.device, shape, kf64, + WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | + WGPUBufferUsage_CopySrc); + + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + packed.size() * sizeof(uint32_t)); + + return tensor; +} + inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, const int32_t *data) { assert(dtype == ki32); @@ -717,27 +796,55 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, return tensor; } -/** - * @brief Overload of the tensor factory function to instantiate a tensor on - * the GPU with a given shape, data type. This overload also takes initial - * half* data to populate the tensor with. - * - * The data is assumed to be of size equal to the product of the dimensions in - * the shape, and is copied to the GPU buffer. - * - * @param[in] ctx Context instance to manage the tensor - * @param[in] shape Shape of the tensor - * @param[in] dtype Data type of the tensor (e.g. kf32) - * @param[in] data Initial data to populate the tensor with - * @return Tensor instance representing the created tensor - * - * @code - * Tensor tensor = createTensor(ctx, {256, 256}, kf32, data); - * @endcode - */ +// Overload for int8_t: pack four 8‑bit ints into one 32‑bit integer inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, - const half *data) { - assert(dtype == kf16); + const int8_t *data) { + assert(dtype == ki8); // unsupported: pack into ki32 + size_t numElements = size(shape); + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + // pack as unsigned bits then reinterpret; shader is then responsible for + // unpacking + packed[idx] |= (static_cast(data[i]) << shift); + } + return createTensor(ctx, shape, ki32, packed.data()); +} + +// Overload for int16_t: pack two 16‑bit ints into one 32‑bit integer +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const int16_t *data) { + assert(dtype == ki16); // unsupported: pack into ki32 + size_t numElements = size(shape); + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + return createTensor(ctx, shape, ki32, packed.data()); +} + +// Overload for int64_t: pack each 64‑bit int into two 32‑bit integers +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const int64_t *data) { + assert(dtype == ki64); // unsupported: pack into two ki32s + size_t numElements = size(shape); + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + int64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast((val >> 32) & 0xFFFFFFFF); + } + return createTensor(ctx, shape, ki32, packed.data()); +} + +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const uint32_t *data) { + assert(dtype == ku32); Tensor tensor = 
createTensor(ctx.pool, ctx.device, shape, dtype, WGPUBufferUsage_Storage | WGPUBufferUsage_CopyDst | @@ -747,6 +854,53 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, return tensor; } +// Overload for uint8_t: pack four 8‑bit integers into one 32‑bit unsigned +// integer +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const uint8_t *data) { + assert(dtype == ku8); // unsupported: pack into ku32 + size_t numElements = size(shape); + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + packed[idx] |= (static_cast(data[i]) << shift); + } + return createTensor(ctx, shape, ku32, packed.data()); +} + +// Overload for uint16_t: pack two 16‑bit integers into one 32‑bit unsigned +// integer +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const uint16_t *data) { + assert(dtype == ku16); // unsupported: pack into ku32 + size_t numElements = size(shape); + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + return createTensor(ctx, shape, ku32, packed.data()); +} + +// Overload for uint64_t: pack each 64‑bit integer into two 32‑bit unsigned +// integers +inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype, + const uint64_t *data) { + assert(dtype == ku64); // unsupported: pack into two ku32s + size_t numElements = size(shape); + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(val >> 32); + } + return createTensor(ctx, shape, ku32, packed.data()); +} + /** * @brief Frees a tensor resource and updates the tensor pool. * @@ -869,7 +1023,8 @@ template T wait(Context &ctx, std::future &f) { * Context ctx = waitForContextFuture(contextFuture); * @endcode */ -template T waitForContextFuture(std::future &f, size_t sleepTime = 10) { +template +T waitForContextFuture(std::future &f, size_t sleepTime = 10) { #ifdef __EMSCRIPTEN__ while (f.wait_for(std::chrono::milliseconds(0)) != std::future_status::ready) { @@ -1358,8 +1513,9 @@ inline void queueWorkDoneCallback(WGPUQueueWorkDoneStatus status, WGPUBufferMapCallbackInfo mapCallbackInfo = { .mode = WGPUCallbackMode_AllowSpontaneous, .callback = bufferMapCallback, - .userdata1 = const_cast(cbData), // Pass the callback data. - .userdata2 = nullptr // No additional user data. + .userdata1 = + const_cast(cbData), // Pass the callback data. + .userdata2 = nullptr // No additional user data. }; // Begin the asynchronous mapping of the readback buffer. @@ -1639,6 +1795,251 @@ inline void toCPU(Context &ctx, Tensor &tensor, std::array &data, wait(ctx, future); } +inline void toCPU(Context &ctx, Tensor &tensor, NumType dtype, void *output, + size_t sourceOffset = 0) { + size_t numElements = size(tensor.shape); + switch (dtype) { + // These types are directly supported. + case kf16: + case kf32: + case ku32: + case ki32: + toCPU(ctx, tensor, output, tensor.data.size, sourceOffset); + break; + + // kf64 to reverse bit‐packing of doubles. + case kf64: { + // We expect each double to have been packed into 2 uint32_t values. 
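    // The unpack below mirrors that packing: word 2 * i holds the low 32 bits and
    // word 2 * i + 1 the high bits, so reassembling them and memcpy-ing into a double
    // restores the original value bit-for-bit.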
+ std::vector tmp(numElements * 2); + // Read the packed data (each element is 4 bytes) + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + double *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + uint32_t low = tmp[2 * i]; + uint32_t high = tmp[2 * i + 1]; + // Reassemble the 64-bit raw representation. + uint64_t bits = (static_cast(high) << 32) | low; + // Copy the raw bits into a double. + double d; + std::memcpy(&d, &bits, sizeof(double)); + dst[i] = d; + } + break; + } + + // For int8_t: four 8‑bit ints packed into one int32_t. + case ki8: { + size_t packedCount = (numElements + 3) / 4; + std::vector tmp(packedCount); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset); + int8_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFF); + } + break; + } + + // For int16_t: two 16‑bit ints packed into one int32_t. + case ki16: { + size_t packedCount = (numElements + 1) / 2; + std::vector tmp(packedCount); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset); + int16_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFFFF); + } + break; + } + + // For int64_t: each 64‑bit int was packed into two int32_t. + case ki64: { + std::vector tmp(numElements * 2); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset); + int64_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + int32_t low = tmp[2 * i]; + int32_t high = tmp[2 * i + 1]; + dst[i] = + (static_cast(high) << 32) | (static_cast(low)); + } + break; + } + + // For uint8_t: four 8‑bit uints packed into one uint32_t. + case ku8: { + size_t packedCount = (numElements + 3) / 4; + std::vector tmp(packedCount); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + uint8_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFF); + } + break; + } + + // For uint16_t: two 16‑bit uints packed into one uint32_t. + case ku16: { + size_t packedCount = (numElements + 1) / 2; + std::vector tmp(packedCount); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + uint16_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFFFF); + } + break; + } + + // For uint64_t: each 64‑bit unsigned int was packed into two uint32_t. + case ku64: { + std::vector tmp(numElements * 2); + toCPU(ctx, tensor, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + uint64_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + uint32_t low = tmp[2 * i]; + uint32_t high = tmp[2 * i + 1]; + dst[i] = (static_cast(high) << 32) | low; + } + break; + } + + default: + LOG(kDefLog, kError, "Unsupported dtype in toCPUUnpack"); + break; + } +} + +inline void toCPU(Context &ctx, WGPUBuffer buffer, NumType dtype, void *output, + size_t numElements, size_t sourceOffset = 0) { + switch (dtype) { + // Directly supported types. 
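  // For these four types the GPU-side element width equals sizeBytes(dtype), so the
  // readback is a plain copy of numElements * sizeBytes(dtype) bytes; every other case
  // below reads back the packed 32-bit representation and converts it on the CPU.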
+ case kf16: + case kf32: + case ku32: + case ki32: { + size_t byteSize = numElements * sizeBytes(dtype); + toCPU(ctx, buffer, output, byteSize, sourceOffset); + break; + } + + // kf64 to reverse bit‐packing of doubles. + case kf64: { + // We expect each double to have been packed into 2 uint32_t values. + std::vector tmp(numElements * 2); + // Read the packed data (each element is 4 bytes) + toCPU(ctx, buffer, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + double *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + uint32_t low = tmp[2 * i]; + uint32_t high = tmp[2 * i + 1]; + // Reassemble the 64-bit raw representation. + uint64_t bits = (static_cast(high) << 32) | low; + // Copy the raw bits into a double. + double d; + std::memcpy(&d, &bits, sizeof(double)); + dst[i] = d; + } + break; + } + + // For int8_t: four 8‑bit ints packed into one int32_t. + case ki8: { + size_t packedCount = (numElements + 3) / 4; + std::vector tmp(packedCount); + toCPU(ctx, buffer, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset); + int8_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFF); + } + break; + } + + // For int16_t: two 16‑bit ints packed into one int32_t. + case ki16: { + size_t packedCount = (numElements + 1) / 2; + std::vector tmp(packedCount); + toCPU(ctx, buffer, tmp.data(), packedCount * sizeof(int32_t), sourceOffset); + int16_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFFFF); + } + break; + } + + // For int64_t: each 64‑bit int is packed into two int32_t. + case ki64: { + std::vector tmp(numElements * 2); + toCPU(ctx, buffer, tmp.data(), tmp.size() * sizeof(int32_t), sourceOffset); + int64_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + int32_t low = tmp[2 * i]; + int32_t high = tmp[2 * i + 1]; + dst[i] = + (static_cast(high) << 32) | (static_cast(low)); + } + break; + } + + // For uint8_t: four 8‑bit uints packed into one uint32_t. + case ku8: { + size_t packedCount = (numElements + 3) / 4; + std::vector tmp(packedCount); + toCPU(ctx, buffer, tmp.data(), packedCount * sizeof(uint32_t), + sourceOffset); + uint8_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFF); + } + break; + } + + // For uint16_t: two 16‑bit uints packed into one uint32_t. + case ku16: { + size_t packedCount = (numElements + 1) / 2; + std::vector tmp(packedCount); + toCPU(ctx, buffer, tmp.data(), packedCount * sizeof(uint32_t), + sourceOffset); + uint16_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + dst[i] = static_cast((tmp[idx] >> shift) & 0xFFFF); + } + break; + } + + // For uint64_t: each 64‑bit unsigned int packed into two uint32_t. 
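  // Reassembled as (uint64_t(high) << 32) | low, the inverse of the split performed by
  // the uint64_t createTensor/toGPU overloads.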
+ case ku64: { + std::vector tmp(numElements * 2); + toCPU(ctx, buffer, tmp.data(), tmp.size() * sizeof(uint32_t), sourceOffset); + uint64_t *dst = static_cast(output); + for (size_t i = 0; i < numElements; ++i) { + uint32_t low = tmp[2 * i]; + uint32_t high = tmp[2 * i + 1]; + dst[i] = (static_cast(high) << 32) | low; + } + break; + } + + default: + LOG(kDefLog, kError, "Unsupported dtype in toCPU (raw buffer override)"); + break; + } +} + /** * @brief Copies data from CPU memory to a GPU buffer. The toGPU overloads are * effectively a convenience wrapper around the WebGPU API call @@ -1659,6 +2060,112 @@ inline void toGPU(Context &ctx, const void *data, WGPUBuffer buffer, wgpuQueueWriteBuffer(ctx.queue, buffer, 0, data, size); } +// Overload for float: directly copy the float data. +inline void toGPU(Context &ctx, const float *data, WGPUBuffer buffer, + size_t size) { + toGPU(ctx, static_cast(data), buffer, size); +} + +// Overload for half: directly copy the half data. +inline void toGPU(Context &ctx, const half *data, WGPUBuffer buffer, + size_t size) { + toGPU(ctx, static_cast(data), buffer, size); +} + +// Overload for double: bit-pack each double into two 32‑bit unsigned integers. +inline void toGPU(Context &ctx, const double *data, WGPUBuffer buffer, + size_t numElements) { + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t bits; + std::memcpy(&bits, &data[i], + sizeof(double)); // Reinterpret double as raw bits. + packed[2 * i] = static_cast(bits & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(bits >> 32); + } + toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(uint32_t)); +} + +// Overload for int8_t: pack four 8‑bit ints into one 32‑bit integer. +inline void toGPU(Context &ctx, const int8_t *data, WGPUBuffer buffer, + size_t numElements) { + // Number of int8_t elements equals size (sizeof(int8_t)==1) + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + packed[idx] |= (static_cast(data[i]) << shift); + //LOG(kDefLog, kInfo, "toGPU: %d %d %d", data[i], packed[idx], idx); + } + toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t)); +} + +// Overload for int16_t: pack two 16‑bit ints into one 32‑bit integer. +inline void toGPU(Context &ctx, const int16_t *data, WGPUBuffer buffer, + size_t numElements) { + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + toGPU(ctx, packed.data(), buffer, packedCount * sizeof(int32_t)); +} + +// Overload for int64_t: pack each 64‑bit int into two 32‑bit integers. +inline void toGPU(Context &ctx, const int64_t *data, WGPUBuffer buffer, + size_t numElements) { + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + int64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast((val >> 32) & 0xFFFFFFFF); + } + toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(int32_t)); +} + +// Overload for uint8_t: pack four 8‑bit uints into one 32‑bit unsigned integer. 
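// Illustrative sketch (hypothetical helpers, not part of the gpu.hpp API): the
// 4-bytes-per-word layout used by the 8-bit overloads in this file places byte i in
// word i / 4 at bit offset (i % 4) * 8, i.e. little-endian byte order within each
// word. <cstdint> and <vector> are already used throughout this header.
static inline std::vector<uint32_t> packBytesExample(const std::vector<uint8_t> &src) {
  std::vector<uint32_t> packed((src.size() + 3) / 4, 0);
  for (size_t i = 0; i < src.size(); ++i) {
    packed[i / 4] |= static_cast<uint32_t>(src[i]) << ((i % 4) * 8);
  }
  return packed;
}

static inline std::vector<uint8_t> unpackBytesExample(const std::vector<uint32_t> &packed,
                                                      size_t numElements) {
  std::vector<uint8_t> out(numElements);
  for (size_t i = 0; i < numElements; ++i) {
    out[i] = static_cast<uint8_t>((packed[i / 4] >> ((i % 4) * 8)) & 0xFFu);
  }
  return out;
}
// e.g. unpackBytesExample(packBytesExample({1, 2, 254, 255, 7}), 5) yields the original
// five bytes unchanged.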
+inline void toGPU(Context &ctx, const uint8_t *data, WGPUBuffer buffer, + size_t numElements) { + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + packed[idx] |= (static_cast(data[i]) << shift); + } + toGPU(ctx, packed.data(), buffer, packedCount * sizeof(uint32_t)); +} + +// Overload for uint16_t: pack two 16‑bit uints into one 32‑bit unsigned +// integer. +inline void toGPU(Context &ctx, const uint16_t *data, WGPUBuffer buffer, + size_t numElements) { + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + toGPU(ctx, packed.data(), buffer, packedCount * sizeof(uint32_t)); +} + +// Overload for uint64_t: pack each 64‑bit uint into two 32‑bit unsigned +// integers. +inline void toGPU(Context &ctx, const uint64_t *data, WGPUBuffer buffer, + size_t numElements) { + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(val >> 32); + } + toGPU(ctx, packed.data(), buffer, packed.size() * sizeof(uint32_t)); +} + /** * @brief Overload of the toGPU function to copy data from CPU memory to a GPU * taking a Tensor instance instead of a WGPUBuffer instance. @@ -1680,22 +2187,105 @@ inline void toGPU(Context &ctx, const half *data, Tensor &tensor) { tensor.data.size); } -inline void toGPU(Context &ctx, const int *data, Tensor &tensor) { - wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, +// Overload for double: bit-pack each double into two 32‑bit unsigned integers. +inline void toGPU(Context &ctx, const double *data, Tensor &tensor) { + size_t numElements = tensor.data.size / sizeof(double); + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t bits; + std::memcpy(&bits, &data[i], + sizeof(double)); // Reinterpret double as raw bits. 
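    // Low word first, then high word -- the same order as the createTensor(double)
    // overload above, so both paths can be unpacked by the kf64 branch of toCPU.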
+ packed[2 * i] = static_cast(bits & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(bits >> 32); + } + toGPU(ctx, packed.data(), tensor.data.buffer, + packed.size() * sizeof(uint32_t)); +} + +// Overload for int8_t: pack four 8‑bit integers into one 32‑bit integer +inline void toGPU(Context &ctx, const int8_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + // Pack as unsigned then reinterpret (shader will unpack) + packed[idx] |= (static_cast(data[i]) << shift); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), tensor.data.size); } -inline void toGPU(Context &ctx, const float *data, Tensor &tensor, - size_t size) { - wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size); +// Overload for int16_t: pack two 16‑bit integers into one 32‑bit integer +inline void toGPU(Context &ctx, const int16_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + tensor.data.size); +} + +// Overload for int64_t: pack each 64‑bit integer into two 32‑bit integers +inline void toGPU(Context &ctx, const int64_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + std::vector packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + int64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast((val >> 32) & 0xFFFFFFFF); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + tensor.data.size); } -inline void toGPU(Context &ctx, const half *data, Tensor &tensor, size_t size) { - wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size); +// Overload for uint8_t: pack four 8‑bit unsigned integers into one 32‑bit +// unsigned +inline void toGPU(Context &ctx, const uint8_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + size_t packedCount = (numElements + 3) / 4; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 4; + size_t shift = (i % 4) * 8; + packed[idx] |= (static_cast(data[i]) << shift); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + tensor.data.size); } -inline void toGPU(Context &ctx, const int *data, Tensor &tensor, size_t size) { - wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, data, size); +// Overload for uint16_t: pack two 16‑bit unsigned integers into one 32‑bit +// unsigned +inline void toGPU(Context &ctx, const uint16_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + size_t packedCount = (numElements + 1) / 2; + std::vector packed(packedCount, 0); + for (size_t i = 0; i < numElements; ++i) { + size_t idx = i / 2; + size_t shift = (i % 2) * 16; + packed[idx] |= (static_cast(data[i]) << shift); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + tensor.data.size); +} + +// Overload for uint64_t: pack each 64‑bit unsigned integer into two 32‑bit +// unsigned +inline void toGPU(Context &ctx, const uint64_t *data, Tensor &tensor) { + size_t numElements = size(tensor.shape); + std::vector 
packed(numElements * 2); + for (size_t i = 0; i < numElements; ++i) { + uint64_t val = data[i]; + packed[2 * i] = static_cast(val & 0xFFFFFFFF); + packed[2 * i + 1] = static_cast(val >> 32); + } + wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(), + tensor.data.size); } template diff --git a/numeric_types/half.cpp b/numeric_types/half.cpp index c183754..e6e8d71 100644 --- a/numeric_types/half.cpp +++ b/numeric_types/half.cpp @@ -214,13 +214,7 @@ fn main( } } )"; - Context ctx = createContext( - {}, {}, - /*device descriptor, enabling f16 in WGSL*/ - { - .requiredFeatureCount = 1, - .requiredFeatures = std::array{WGPUFeatureName_ShaderF16}.data(), - }); + Context ctx = createContext(); static constexpr size_t N = 10000; std::array inputArr, outputArr; for (int i = 0; i < N; ++i) { @@ -238,7 +232,7 @@ fn main( } } -int testHalfMain() { +int testHalf() { printf("\nHalf-precision float tests\n==========================\n"); printf("\nRegular values float round trips\n\n"); diff --git a/numeric_types/half.hpp b/numeric_types/half.hpp index f78e61a..395e257 100644 --- a/numeric_types/half.hpp +++ b/numeric_types/half.hpp @@ -7,59 +7,27 @@ #include #include -#ifdef _MSC_VER -#include - -static inline uint32_t __builtin_clz(uint32_t value) -{ - unsigned long leading_zero = 0; - if (value == 0) - { - return 32; +// Counts leading zeros in a 16-bit number. +static inline uint16_t half_clz16(uint16_t value) { + uint16_t count = 0; + // Start at the highest bit (0x8000) + for (uint16_t mask = 0x8000; mask; mask >>= 1) { + if (value & mask) + break; + ++count; } - _BitScanReverse(&leading_zero, value); - return 31 - leading_zero; + return count; } -static inline uint16_t __builtin_clz(uint16_t value) -{ - return __builtin_clz(static_cast(value)) - 16; -} - -static inline uint64_t __builtin_clz(uint64_t value) -{ - unsigned long leading_zero = 0; - if (value == 0) - { - return 64; - } -#if defined(_WIN64) - _BitScanReverse64(&leading_zero, value); - return 63 - leading_zero; -#else - uint32_t high = static_cast(value >> 32); - uint32_t low = static_cast(value); - if (high != 0) - { - return __builtin_clz(high); - } - else - { - return 32 + __builtin_clz(low); - } -#endif -} -#endif - struct half; static inline half halfFromFloat(float f); static inline float halfToFloat(half h); +int testHalf(); /** * Experimental implementation of half-precision 16-bit floating point numbers. */ -struct half -{ +struct half { uint16_t data; // Default constructor @@ -77,22 +45,19 @@ struct half operator uint16_t() const { return data; } // Overload assignment operator from uint16_t - half &operator=(uint16_t value) - { + half &operator=(uint16_t value) { data = value; return *this; } // Overload assignment operator from another half - half &operator=(const half &other) - { + half &operator=(const half &other) { data = other.data; return *this; } // Overload assignment operator from float - half &operator=(float value) - { + half &operator=(float value) { data = halfFromFloat(value); return *this; } @@ -103,10 +68,8 @@ struct half * * Based on Mike Acton's half.c implementation. 
*/ -half halfFromFloat(float f) -{ - union - { +half halfFromFloat(float f) { + union { float f; uint32_t u; } floatUnion = {f}; @@ -145,8 +108,7 @@ half halfFromFloat(float f) const uint32_t floatMantissa = float32 & FLOAT_MANTISSA_MASK; // Check for NaN - if ((floatExpMasked == FLOAT_EXP_MASK) && (floatMantissa != 0)) - { + if ((floatExpMasked == FLOAT_EXP_MASK) && (floatMantissa != 0)) { half result; result.data = HALF_EXP_MASK | (floatMantissa >> FLOAT_HALF_MANTISSA_POS_OFFSET); @@ -226,8 +188,7 @@ half halfFromFloat(float f) * * Based on Mike Acton's half.c implementation. */ -float halfToFloat(half h) -{ +float halfToFloat(half h) { // Constants for bit masks, shifts, and biases const uint16_t ONE = 0x0001; const uint16_t TWO = 0x0002; @@ -272,7 +233,7 @@ float halfToFloat(half h) const uint32_t isNan = isExpFlagged && isMantissaNonZero; // Handling denormalized numbers - const uint16_t halfMantissaLeadingZeros = __builtin_clz(halfMantissa) - 16; + const uint16_t halfMantissaLeadingZeros = half_clz16(halfMantissa); const uint16_t halfDenormShiftAmount = halfMantissaLeadingZeros + HALF_FLOAT_DENORM_SA_OFFSET; const uint32_t halfFloatDenormMantissaShiftAmount = @@ -308,8 +269,7 @@ float halfToFloat(half h) const uint32_t result = checkNanResult; // Reinterpret the uint32_t result as a float using a union - union - { + union { uint32_t u; float f; } floatUnion; diff --git a/test/test_gpu.cpp b/test/test_gpu.cpp index b855712..7f07dbf 100644 --- a/test/test_gpu.cpp +++ b/test/test_gpu.cpp @@ -1,4 +1,5 @@ #include "gpu.hpp" +#include "numeric_types/half.hpp" #include #include #include @@ -10,6 +11,114 @@ using namespace gpu; using namespace std::chrono; +// WGSL Kernels + +// Kernel to unpack 4x int8 (packed in i32) to 4x int32 +const char *kPackedInt8ToInt32Kernel = R"( + @group(0) @binding(0) var packed_input: array; + @group(0) @binding(1) var unpacked_output: array; + + // Function to sign-extend an 8-bit value (represented in the lower bits of an i32) + fn sign_extend_i8(val: i32) -> i32 { + return (val << 24) >> 24; + } + + @compute @workgroup_size({{workgroupSize}}) + fn main(@builtin(global_invocation_id) gid: vec3) { + let packed_idx: u32 = gid.x; + + // Check bounds for the PACKED input array + if (packed_idx >= arrayLength(&packed_input)) { + return; + } + + let packed_val = packed_input[packed_idx]; + + // Unpack and write 4 separate i32 values + // Ensure the output buffer is large enough (4x the packed size) + let base_output_idx = packed_idx * 4u; + + // Check bounds for the UNPACKED output array (optional but safer) + // This assumes arrayLength(&unpacked_output) is at least 4 * arrayLength(&packed_input) + if ((base_output_idx + 3u) >= arrayLength(&unpacked_output)) { + return; // Avoid out-of-bounds write if something is wrong + } + + unpacked_output[base_output_idx + 0u] = sign_extend_i8((packed_val >> 0u) & 0xFF); + unpacked_output[base_output_idx + 1u] = sign_extend_i8((packed_val >> 8u) & 0xFF); + unpacked_output[base_output_idx + 2u] = sign_extend_i8((packed_val >> 16u) & 0xFF); + unpacked_output[base_output_idx + 3u] = sign_extend_i8((packed_val >> 24u) & 0xFF); + } + )"; + +// Kernel to pack 4x int32 back into 1x int32 (taking lower 8 bits) +const char *kInt32ToPackedInt8Kernel = R"( + @group(0) @binding(0) var unpacked_input: array; + @group(0) @binding(1) var packed_output: array; + + @compute @workgroup_size({{workgroupSize}}) + fn main(@builtin(global_invocation_id) gid: vec3) { + let packed_idx: u32 = gid.x; // Index for the PACKED output array + + // Check 
bounds for the PACKED output array + if (packed_idx >= arrayLength(&packed_output)) { + return; + } + + let base_input_idx = packed_idx * 4u; + + // Check bounds for the UNPACKED input array (optional but safer) + // Assumes arrayLength(&unpacked_input) is at least 4 * arrayLength(&packed_output) + if ((base_input_idx + 3u) >= arrayLength(&unpacked_input)) { + // Handle potential error or incomplete data - maybe write 0? + packed_output[packed_idx] = 0; + return; + } + + // Read 4 separate i32 values + let val0 = unpacked_input[base_input_idx + 0u]; + let val1 = unpacked_input[base_input_idx + 1u]; + let val2 = unpacked_input[base_input_idx + 2u]; + let val3 = unpacked_input[base_input_idx + 3u]; + + // Pack the lower 8 bits of each into one i32 + var packed_result: i32 = 0; + packed_result = packed_result | ((val0 & 0xFF) << 0u); + packed_result = packed_result | ((val1 & 0xFF) << 8u); + packed_result = packed_result | ((val2 & 0xFF) << 16u); + packed_result = packed_result | ((val3 & 0xFF) << 24u); + + packed_output[packed_idx] = packed_result; + } + )"; + +// Simple addition kernel for i32 +const char *kSimpleAddKernelI32 = R"( + @group(0) @binding(0) var a: array<{{precision}}>; + @group(0) @binding(1) var b: array<{{precision}}>; + @group(0) @binding(2) var c: array<{{precision}}>; + + @compute @workgroup_size({{workgroupSize}}) + fn main(@builtin(global_invocation_id) gid: vec3) { + let i: u32 = gid.x; + if (i < arrayLength(&a)) { + c[i] = a[i] + b[i]; + } + } + )"; + +// A simple WGSL copy kernel that copies input to output. +static const char *kCopyKernel = R"( + @group(0) @binding(0) var inp: array<{{precision}}>; + @group(0) @binding(1) var out: array<{{precision}}>; + @compute @workgroup_size({{workgroupSize}}) + fn main(@builtin(global_invocation_id) gid: vec3) { + let i: u32 = gid.x; + if (i < arrayLength(&inp)) { + out[i] = inp[i]; + } + } + )"; // Forward declarations: void testToCPUWithTensor(); @@ -17,33 +126,649 @@ void testToCPUWithBuffer(); void testToCPUWithTensorSourceOffset(); void testToCPUWithBufferSourceOffset(); void stressTestToCPU(); +void testToCPUWithHalf(); +void testToCPUWithFloat(); +void testToCPUWithDouble(); +void testToCPUWithint8(); +void testToCPUWithint16(); +void testToCPUWithint(); +void testToCPUWithint64(); +void testToCPUWithUint8(); +void testToCPUWithUint16(); +void testToCPUWithUint32(); +void testToCPUWithUint64(); +void testNumTypeSizes(); +void testToCPUUnpack(); +void testCopyShaderPackedUnpack_int8(); +void testAddKernelInt8(); int main() { LOG(kDefLog, kInfo, "Running GPU integration tests..."); + testAddKernelInt8(); + testCopyShaderPackedUnpack_int8(); + testToCPUUnpack(); testToCPUWithTensor(); testToCPUWithBuffer(); testToCPUWithTensorSourceOffset(); testToCPUWithBufferSourceOffset(); + testToCPUWithHalf(); + testToCPUWithFloat(); + testToCPUWithDouble(); + testToCPUWithint8(); + testToCPUWithint16(); + testToCPUWithint(); + testToCPUWithint64(); + testToCPUWithUint8(); + testToCPUWithUint16(); + testToCPUWithUint32(); + testToCPUWithUint64(); + testNumTypeSizes(); stressTestToCPU(); + testHalf(); LOG(kDefLog, kInfo, "All tests passed."); return 0; } +void testAddKernelInt8() { + LOG(kDefLog, kInfo, "Running testAddKernelInt8 (with conversion kernels)..."); -// A simple WGSL copy kernel that copies input to output. 
-static const char *kCopyKernel = R"( -@group(0) @binding(0) var inp: array<{{precision}}>; -@group(0) @binding(1) var out: array<{{precision}}>; -@group(0) @binding(1) var dummy: array<{{precision}}>; -@compute @workgroup_size({{workgroupSize}}) -fn main(@builtin(global_invocation_id) gid: vec3) { - let i: u32 = gid.x; - if (i < arrayLength(&inp)) { - out[i] = inp[i]; +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; // Logical number of int8 elements + std::vector aInput(N), bInput(N), result(N); + std::vector expected(N); + + // CPU Data Setup + for (size_t i = 0; i < N; ++i) { + // Values in range [-10, 9] + aInput[i] = static_cast((i % 20) - 10); + bInput[i] = static_cast(((2 * i) % 20) - 10); + // Compute expected as int then cast back. + int temp = static_cast(aInput[i]) + static_cast(bInput[i]); + expected[i] = static_cast(temp); + result[i] = 0; + } + + // These store the int8 data packed into i32 format on the GPU + Tensor aTensorPacked = createTensor(ctx, Shape{N}, ki8, aInput.data()); + Tensor bTensorPacked = createTensor(ctx, Shape{N}, ki8, bInput.data()); + // Final output tensor, also in packed format + Tensor outputTensorPacked = createTensor(ctx, Shape{N}, ki8); + + // These will hold the data converted to one i32 per original int8 element + Tensor aTensorUnpacked = createTensor(ctx, Shape{N}, ki32); + Tensor bTensorUnpacked = createTensor(ctx, Shape{N}, ki32); + Tensor outputTensorUnpacked = + createTensor(ctx, Shape{N}, ki32); // For the simple add result + + constexpr uint32_t workgroupSize = 256; + size_t packedCount = (N + 3) / 4; // Number of i32 elements in packed buffers + size_t unpackedCount = N; // Number of i32 elements in unpacked buffers + + // Convert Packed Inputs to Unpacked i32 + Kernel unpackKernelA = + createKernel(ctx, {kPackedInt8ToInt32Kernel, workgroupSize, ki32}, + Bindings{aTensorPacked, aTensorUnpacked}, + {cdiv(packedCount, workgroupSize), 1, + 1}); // Dispatch based on packed size + Kernel unpackKernelB = + createKernel(ctx, {kPackedInt8ToInt32Kernel, workgroupSize, ki32}, + Bindings{bTensorPacked, bTensorUnpacked}, + {cdiv(packedCount, workgroupSize), 1, 1}); + // Dispatch based on packed size + dispatchKernel(ctx, unpackKernelA); + dispatchKernel(ctx, unpackKernelB); + + // Perform Simple Addition on Unpacked i32 + Kernel simpleAddKernel = createKernel( + ctx, {kSimpleAddKernelI32, workgroupSize, ki32}, + Bindings{aTensorUnpacked, bTensorUnpacked, outputTensorUnpacked}, + {cdiv(unpackedCount, workgroupSize), 1, + 1}); // Dispatch based on unpacked size + dispatchKernel(ctx, simpleAddKernel); + + // Convert Unpacked i32 Result back to Packed + Kernel packKernel = + createKernel(ctx, {kInt32ToPackedInt8Kernel, workgroupSize, ki32}, + Bindings{outputTensorUnpacked, outputTensorPacked}, + {cdiv(packedCount, workgroupSize), 1, + 1}); // Dispatch based on packed size + dispatchKernel(ctx, packKernel); + + // Copy Final Packed Result to CPU and Unpack + // Use the original toCPU for ki8, which handles the final CPU-side unpacking + toCPU(ctx, outputTensorPacked, ki8, result.data(), 0); + + for (size_t i = 0; i < N; ++i) { + assert(result[i] == expected[i]); + } + + LOG(kDefLog, kInfo, "testAddKernelInt8 (with conversion kernels) passed."); +} + +void testCopyShaderPackedUnpack_int8() { + LOG(kDefLog, kInfo, "Running testCopyShaderPackedUnpack_int8..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); 
+#endif + + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + // Values between -128 and 127. + inputData[i] = static_cast((i % 256) - 128); + } + + // Create an input tensor using the int8_t overload. + // Under the hood the data is packed into int32_t. + Tensor inputTensor = createTensor(ctx, Shape{N}, ki8, inputData.data()); + + // Create an output tensor of the same shape and unsupported type. + Tensor outputTensor = createTensor(ctx, Shape{N}, ki8); + + // Our copy shader (kCopyKernel) expects to work with supported types. + // Since int8_t is packed into int32_t, we pass 'ki32' as our shader + // precision. + Kernel copyKernel = + createKernel(ctx, {kCopyKernel, 256, ki32}, + Bindings{inputTensor, outputTensor}, {cdiv(N, 256), 1, 1}); + dispatchKernel(ctx, copyKernel); + + // Now retrieve the output from the GPU and unpack from the packed int32_t + // back to int8_t. + toCPU(ctx, outputTensor, ki8, outputData.data(), 0); + + // Verify the unpacked data matches the original input. + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "testCopyShaderPackedUnpack_int8 passed."); +} + +void testToCPUUnpack() { + LOG(kDefLog, kInfo, "Running testToCPUUnpack..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + // Test for double (kf64 -> packed as kf32) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i) * 3.14; + } + Tensor tensor = createTensor(ctx, Shape{N}, kf64, inputData.data()); + toCPU(ctx, tensor, kf64, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + // Allow for a very small epsilon error due to float conversion. 
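      // (Strictly, the kf64 path round-trips the raw 64-bit pattern, so exact equality
      // should also hold; the epsilon is just defensive.)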
+ assert(fabs(inputData[i] - outputData[i]) < 1e-4); + } + LOG(kDefLog, kInfo, "toCPUUnpack for double passed."); + } + + // Test for int8_t (ki8 -> packed as ki32) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast((i % 256) - 128); + } + Tensor tensor = createTensor(ctx, Shape{N}, ki8, inputData.data()); + toCPU(ctx, tensor, ki8, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for int8_t passed."); + } + + // Test for int16_t (ki16 -> packed as ki32) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast((i % 65536) - 32768); + } + Tensor tensor = createTensor(ctx, Shape{N}, ki16, inputData.data()); + toCPU(ctx, tensor, ki16, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for int16_t passed."); + } + + // Test for int64_t (ki64 -> packed as two ki32s) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i) - 512; + } + Tensor tensor = createTensor(ctx, Shape{N}, ki64, inputData.data()); + toCPU(ctx, tensor, ki64, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for int64_t passed."); + } + + // Test for uint8_t (ku8 -> packed as ku32) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i % 256); + } + Tensor tensor = createTensor(ctx, Shape{N}, ku8, inputData.data()); + toCPU(ctx, tensor, ku8, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for uint8_t passed."); + } + + // Test for uint16_t (ku16 -> packed as ku32) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i % 65536); + } + Tensor tensor = createTensor(ctx, Shape{N}, ku16, inputData.data()); + toCPU(ctx, tensor, ku16, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for uint16_t passed."); + } + + // Test for uint64_t (ku64 -> packed as two ku32s) + { + constexpr size_t N = 1024; + std::vector inputData(N), outputData(N); + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i) * 123456789ULL; + } + Tensor tensor = createTensor(ctx, Shape{N}, ku64, inputData.data()); + toCPU(ctx, tensor, ku64, outputData.data(), 0); + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "toCPUUnpack for uint64_t passed."); + } + + LOG(kDefLog, kInfo, "All toCPUUnpack tests passed."); +} + +void testNumTypeSizes() { + LOG(kDefLog, kInfo, "Running testNumTypeSizes..."); + + assert(sizeBytes(kf16) == 2); + assert(sizeBytes(kf32) == 4); + assert(sizeBytes(ki8) == sizeof(uint8_t)); // typically 1 + assert(sizeBytes(ki16) == sizeof(uint16_t)); // typically 2 + assert(sizeBytes(ki32) == sizeof(int32_t)); // typically 4 + assert(sizeBytes(ku8) == sizeof(uint8_t)); // typically 1 + assert(sizeBytes(ku16) == sizeof(uint16_t)); // typically 2 + assert(sizeBytes(ku32) == sizeof(uint32_t)); // typically 4 + + LOG(kDefLog, kInfo, 
"testNumTypeSizes passed."); +} + +// Test using half-precision (16-bit float) data. +void testToCPUWithHalf() { + LOG(kDefLog, kInfo, "Running testToCPUWithHalf..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + // Construct half from float. + inputData[i] = half(static_cast(i)); + } + + Tensor inputTensor = createTensor(ctx, Shape{N}, kf16, inputData.data()); + + // Copy GPU output to CPU. + toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData)); + + // Validate the copy (using float conversion for approximate equality). + for (size_t i = 0; i < N; ++i) { + float inVal = static_cast(inputData[i]); + float outVal = static_cast(outputData[i]); + // Use a small epsilon to compare half values. + assert(fabs(inVal - outVal) <= 0.01f); + } + LOG(kDefLog, kInfo, "testToCPUWithHalf passed."); +} + +// Test using float (32-bit) data. +void testToCPUWithFloat() { + LOG(kDefLog, kInfo, "Running testToCPUWithFloat..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i * 1.5f); + outputData[i] = 0.0f; + } + + Tensor inputTensor = createTensor(ctx, Shape{N}, kf32, inputData.data()); + + // Copy GPU output to CPU. + toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData)); + + // Validate the copy. + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithFloat passed."); +} + +// Test using double (64-bit floating point) data. +void testToCPUWithDouble() { + LOG(kDefLog, kInfo, "Running testToCPUWithDouble..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + for (size_t i = 0; i < N; ++i) { + inputData[i] = static_cast(i) * 2.5; + outputData[i] = 0.0; + } + + Tensor inputTensor = createTensor(ctx, Shape{N}, kf64, inputData.data()); + + // Copy GPU output to CPU. + toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData)); + + // Validate the copy. + for (size_t i = 0; i < N; ++i) { + assert(inputData[i] == outputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithDouble passed."); +} + +void testToCPUWithint8() { + LOG(kDefLog, kInfo, "Running testToCPUWithint8..."); + +#ifdef USE_DAWN_API + Context ctx = createContextByGpuIdx(0); +#else + Context ctx = createContext(); +#endif + + constexpr size_t N = 1024; + std::array inputData, outputData; + // Use a range that includes negative values. + for (size_t i = 0; i < N; ++i) { + // Values between -128 and 127. + inputData[i] = static_cast((i % 256) - 128); + outputData[i] = 0; + } + + // Create a tensor for int8_t. + Tensor inputTensor = createTensor(ctx, Shape{N}, ki8, inputData.data()); + + // Synchronously copy the GPU tensor data to CPU. + toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData)); + + // Validate the copy. + for (size_t i = 0; i < N; ++i) { + // LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]); + // LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]); + assert(outputData[i] == inputData[i]); + } + LOG(kDefLog, kInfo, "testToCPUWithint8 passed."); +} + +// Test using int16_t data. 
+void testToCPUWithint16() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithint16...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<int16_t, N> inputData, outputData;
+  // Use a range that includes negative values.
+  for (size_t i = 0; i < N; ++i) {
+    // Values between -32768 and 32767.
+    inputData[i] = static_cast<int16_t>((i % 65536) - 32768);
+    outputData[i] = 0;
+  }
+
+  // Create a tensor for int16_t.
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ki16, inputData.data());
+
+  // Synchronously copy the GPU tensor data to CPU.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Validate the copy.
+  for (size_t i = 0; i < N; ++i) {
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]);
+    assert(outputData[i] == inputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithint16 passed.");
+}
+
+// Test using int (int32_t) data.
+void testToCPUWithint() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithint...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<int32_t, N> inputData, outputData;
+  // Fill with sample data.
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] =
+        static_cast<int32_t>(i) - 512; // Negative and positive values.
+    outputData[i] = 0;
+  }
+
+  // Create a tensor for int32_t.
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ki32, inputData.data());
+
+  // Synchronously copy the GPU tensor data to CPU.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Validate the copy.
+  for (size_t i = 0; i < N; ++i) {
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %d", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %d", i, outputData[i]);
+    assert(outputData[i] == inputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithint passed.");
+}
+
+// Test using int64_t (64-bit signed integer) data.
+void testToCPUWithint64() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithint64...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<int64_t, N> inputData, outputData;
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] =
+        static_cast<int64_t>(i) - 512; // Some negative and positive values.
+    outputData[i] = 0;
+  }
+
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ki64, inputData.data());
+
+  // Copy GPU output to CPU.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Validate the copy.
+  for (size_t i = 0; i < N; ++i) {
+    assert(inputData[i] == outputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithint64 passed.");
+}
+
+void testToCPUWithUint8() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithUint8...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<uint8_t, N> inputData, outputData;
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] = static_cast<uint8_t>(i % 256);
+    outputData[i] = 0;
+  }
+
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ku8, inputData.data());
+
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Verify the output matches the input.
+  for (size_t i = 0; i < N; ++i) {
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
+    assert(outputData[i] == inputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithUint8 passed.");
 }
-)";
+void testToCPUWithUint16() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithUint16...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<uint16_t, N> inputData, outputData;
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] = static_cast<uint16_t>(i % 65536);
+    outputData[i] = 0;
+  }
+
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ku16, inputData.data());
+
+  // Synchronously copy GPU output to CPU using the tensor overload.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Verify the output matches the input.
+  for (size_t i = 0; i < N; ++i) {
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
+    assert(outputData[i] == inputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithUint16 passed.");
+}
+
+void testToCPUWithUint32() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithUint32...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<uint32_t, N> inputData, outputData;
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] = static_cast<uint32_t>(i);
+    outputData[i] = 0;
+  }
+
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ku32, inputData.data());
+
+  // Synchronously copy GPU output to CPU using the tensor overload.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Verify the output matches the input.
+  for (size_t i = 0; i < N; ++i) {
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %u", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %u", i, outputData[i]);
+    assert(outputData[i] == inputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithUint32 passed.");
+}
+
+// Test using uint64_t (64-bit unsigned integer) data.
+void testToCPUWithUint64() {
+  LOG(kDefLog, kInfo, "Running testToCPUWithUint64...");
+
+#ifdef USE_DAWN_API
+  Context ctx = createContextByGpuIdx(0);
+#else
+  Context ctx = createContext();
+#endif
+
+  constexpr size_t N = 1024;
+  std::array<uint64_t, N> inputData, outputData;
+  for (size_t i = 0; i < N; ++i) {
+    inputData[i] = static_cast<uint64_t>(i);
+    outputData[i] = 0;
+  }
+
+  // Assuming a new NumType 'ku64' for 64-bit unsigned integers.
+  Tensor inputTensor = createTensor(ctx, Shape{N}, ku64, inputData.data());
+
+  // Copy GPU output to CPU.
+  toCPU(ctx, inputTensor, outputData.data(), sizeof(outputData));
+
+  // Validate the copy.
+  for (size_t i = 0; i < N; ++i) {
+    assert(inputData[i] == outputData[i]);
+  }
+  LOG(kDefLog, kInfo, "testToCPUWithUint64 passed.");
+}
 
 // Test using the overload that takes a Tensor.
 void testToCPUWithTensor() {
@@ -78,8 +803,8 @@ void testToCPUWithTensor() {
 
   // Verify the output matches the input.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]);
-    LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
+    // LOG(kDefLog, kInfo, "inputData[%zu] = %f", i, inputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
     assert(outputData[i] == inputData[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithTensor passed.");
@@ -116,7 +841,7 @@ void testToCPUWithBuffer() {
 
   // Verify that the CPU output matches the original data.
   for (size_t i = 0; i < N; ++i) {
-    LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
+    // LOG(kDefLog, kInfo, "outputData[%zu] = %f", i, outputData[i]);
     assert(outputData[i] == data[i]);
   }
   LOG(kDefLog, kInfo, "testToCPUWithBuffer passed.");
@@ -158,8 +883,8 @@ void testToCPUWithTensorSourceOffset() {
   for (size_t i = 0; i < copyCount; ++i) {
     float expected = inputData[sourceOffsetElements + i];
     float actual = cpuOutput[i];
-    LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
-    LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
+    // LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
+    // LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
     assert(expected == actual);
   }
   LOG(kDefLog, kInfo, "testToCPUWithTensorSourceOffset passed.");
@@ -201,8 +926,8 @@ void testToCPUWithBufferSourceOffset() {
   for (size_t i = 0; i < copyCount; ++i) {
     float expected = inputData[sourceOffsetElements + i];
     float actual = cpuOutput[i];
-    LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
-    LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
+    // LOG(kDefLog, kInfo, "cpuOutput[%zu] = %f", i, actual);
+    // LOG(kDefLog, kInfo, "expected[%zu] = %f", i, expected);
     assert(expected == actual);
   }
   LOG(kDefLog, kInfo, "testToCPUWithBufferSourceOffset passed.");
@@ -229,22 +954,24 @@ void stressTestToCPU() {
   auto startTime = high_resolution_clock::now();
   size_t opCount = 0;
   while (high_resolution_clock::now() - startTime < seconds(2)) {
-    // Allocate an output buffer (using a shared_ptr so it stays valid until the future completes)
+    // Allocate an output buffer (using a shared_ptr so it stays valid until the
+    // future completes)
     auto outputData = std::make_shared<std::vector<float>>(N, 0.0f);
 
     // Use the tensor overload; we’re copying the entire tensor (destOffset = 0)
-    LOG(kDefLog, kInfo, "Copying %zu bytes from GPU to CPU...", N * sizeof(float)); // log count
-    LOG(kDefLog, kInfo, "opCount = %zu", opCount);
-    auto fut = toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0);
+    auto fut =
+        toCPUAsync(ctx, tensor, outputData->data(), N * sizeof(float), 0);
     wait(ctx, fut);
     ++opCount;
   }
-  
+
   auto endTime = high_resolution_clock::now();
   auto totalMs = duration_cast<milliseconds>(endTime - startTime).count();
   double throughput = (opCount / (totalMs / 1000.0));
-  LOG(kDefLog, kInfo, "Stress test completed:\n"
-      " %zu GPU to CPU operations in %lld ms\n"
-      " Throughput: %.2f ops/sec", opCount, totalMs, throughput);
+  LOG(kDefLog, kInfo,
+      "Stress test completed:\n"
+      " %zu GPU to CPU operations in %lld ms\n"
+      " Throughput: %.2f ops/sec",
+      opCount, totalMs, throughput);
 }