Skip to content

Commit c2cdcd6

Browse files
Fix the size of packed tensor
1 parent 7abfedd commit c2cdcd6

File tree

6 files changed

+65
-26
lines changed

6 files changed

+65
-26
lines changed

Makefile

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -69,6 +69,9 @@ all: dawnlib check-clang check-linux-vulkan lib pch
6969
cd examples/shadertui && make build/shadertui
7070
cd examples/transpose && make build/transpose
7171

72+
test-gpu: dawnlib check-clang
73+
$(LIBSPEC) && clang++ -std=c++17 -g -fsanitize=address -fno-omit-frame-pointer -Wall $(INCLUDES) test/test_gpu.cpp numeric_types/half.cpp -L$(LIBDIR) -lwebgpu_dawn -Wl,-rpath,$(GPUCPP)/third_party/lib -ldl -o build/test_gpu && ./build/test_gpu
74+
7275
# Test 16-bit floating point type
7376
test-half: dawnlib check-clang
7477
$(LIBSPEC) && clang++ -std=c++17 $(INCLUDES) numeric_types/half.cpp -L$(LIBDIR) -lwebgpu_dawn -ldl -o build/half && ./build/half

examples/hello_world/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ build/$(TARGET): run.cpp
2323
mkdir -p build && $(CXX) $(FLAGS) -DNO_LOG -o ./build/$(TARGET)
2424

2525
debug: run.cpp
26-
mkdir -p build && $(CXX) $(FLAGS) -g -Wall -o ./build/$(TARGET)
26+
mkdir -p build && $(CXX) $(FLAGS) -g -fsanitize=address -fno-omit-frame-pointer -Wall -o ./build/$(TARGET)
2727

2828
clean:
2929
read -r -p "This will delete the contents of build/*. Are you sure? [CTRL-C to abort] " response && rm -rf build/*

examples/hello_world/run.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ int main(int argc, char **argv) {
4040
for (int i = 0; i < N; ++i) {
4141
inputArr[i] = static_cast<float>(i) / 10.0; // dummy input data
4242
}
43+
std::cout << Shape{N} << std::endl;
4344
Tensor input = createTensor(ctx, Shape{N}, kf32, inputArr.data());
4445
Tensor output = createTensor(ctx, Shape{N}, kf32);
4546
Kernel op = createKernel(ctx, {kGelu, 256, kf32},

examples/matmul/Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,11 +10,14 @@ ifeq ($(shell $(CXX) -std=c++17 -x c++ -E -include array - < /dev/null > /dev/nu
1010
else
1111
STDLIB := -stdlib=libc++
1212
endif
13-
FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -I$(GPUCPP)/third_party/headers/webgpu -L$(GPUCPP)/third_party/lib run.cpp -ldl -lwebgpu_dawn
13+
FLAGS=-std=c++17 $(STDLIB) -I$(GPUCPP) -I$(GPUCPP)/third_party/headers -I$(GPUCPP)/third_party/headers/webgpu -L$(GPUCPP)/third_party/lib run.cpp -ldl -lwebgpu_dawn -Wl,-rpath,$(GPUCPP)/third_party/lib
1414

1515
run: ./build/$(TARGET)
1616
$(LIBSPEC) && ./build/$(TARGET)
1717

18+
debug: run.cpp
19+
mkdir -p build && $(CXX) $(FLAGS) -g -fsanitize=address -fno-omit-frame-pointer -Wall -o ./build/$(TARGET)
20+
1821
run_with_metal_profiler: ./build/$(TARGET)_with_metal_profiler
1922
$(LIBSPEC) && export METAL_CAPTURE_ENABLED=1 && ./build/$(TARGET)_with_metal_profiler
2023

gpu.hpp

Lines changed: 52 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -71,6 +71,20 @@ struct Shape {
7171
}
7272
};
7373

74+
inline std::ostream& operator<<(std::ostream& os, const Shape& shape)
75+
{
76+
int size = shape.rank;
77+
os << "Shape: [";
78+
for (int i=0;i<size-1;i++){
79+
os << shape.data[i] << ",";
80+
}
81+
if ( size != 0 ) {
82+
os << shape.data[size-1];
83+
}
84+
os << "]";
85+
return os;
86+
}
87+
7488
/**
7589
* @brief Returns the number of elements in a tensor with the given shape,
7690
* which is equal to the product of the dimensions.
@@ -210,30 +224,30 @@ enum NumType {
210224
/**
211225
* @brief Returns the number of bytes of a number type.
212226
*/
213-
inline size_t sizeBytes(const NumType &type) {
227+
inline size_t sizeBytes(const NumType &type, int numElements = 1) {
214228
switch (type) {
215229
case kf16:
216-
return sizeof(half);
230+
return sizeof(half) * numElements;
217231
case kf32:
218-
return sizeof(float);
232+
return sizeof(float) * numElements;
219233
case kf64:
220-
return sizeof(double);
234+
return sizeof(double) * numElements;
221235
case ki8:
222-
return sizeof(int8_t);
236+
return sizeof(uint32_t) * ((numElements + 3) / 4);
223237
case ki16:
224-
return sizeof(int16_t);
238+
return sizeof(uint32_t) * ((numElements + 1) / 2);
225239
case ki32:
226-
return sizeof(int32_t);
240+
return sizeof(int32_t) * numElements;
227241
case ki64:
228-
return sizeof(int64_t);
242+
return sizeof(int64_t) * numElements;
229243
case ku8:
230-
return sizeof(uint8_t);
244+
return sizeof(uint32_t) * ((numElements + 3) / 4);
231245
case ku16:
232-
return sizeof(uint16_t);
246+
return sizeof(uint32_t) * ((numElements + 1) / 2);
233247
case ku32:
234-
return sizeof(uint32_t);
248+
return sizeof(uint32_t) * numElements;
235249
case ku64:
236-
return sizeof(uint64_t);
250+
return sizeof(uint64_t) * numElements;
237251
default:
238252
LOG(kDefLog, kError, "Invalid NumType in size calculation.");
239253
return 0;
@@ -697,7 +711,7 @@ inline Tensor createTensor(TensorPool &pool, WGPUDevice &device,
697711
WGPUBufferUsage_CopySrc) {
698712
LOG(kDefLog, kTrace, "Creating tensor");
699713
size_t numElements = size(shape);
700-
size_t size = sizeBytes(dtype) * numElements;
714+
size_t size = sizeBytes(dtype, numElements);
701715
WGPUBufferDescriptor bufferDesc = {
702716
.label = {.data = nullptr, .length = 0},
703717
.usage = usage,
@@ -828,7 +842,10 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
828842
// unpacking
829843
packed[idx] |= (static_cast<uint8_t>(data[i]) << shift);
830844
}
831-
return createTensor(ctx, shape, ki32, packed.data());
845+
Tensor tensor = createTensor(ctx, shape, ki8);
846+
wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
847+
tensor.data.size);
848+
return tensor;
832849
}
833850

834851
// Overload for int16_t: pack two 16‑bit ints into one 32‑bit integer
@@ -843,7 +860,10 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
843860
size_t shift = (i % 2) * 16;
844861
packed[idx] |= (static_cast<uint16_t>(data[i]) << shift);
845862
}
846-
return createTensor(ctx, shape, ki32, packed.data());
863+
Tensor tensor = createTensor(ctx, shape, ki16);
864+
wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
865+
tensor.data.size);
866+
return tensor;
847867
}
848868

849869
// Overload for int64_t: pack each 64‑bit int into two 32‑bit integers
@@ -857,7 +877,10 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
857877
packed[2 * i] = static_cast<int32_t>(val & 0xFFFFFFFF);
858878
packed[2 * i + 1] = static_cast<int32_t>((val >> 32) & 0xFFFFFFFF);
859879
}
860-
return createTensor(ctx, shape, ki32, packed.data());
880+
Tensor tensor = createTensor(ctx, shape, ki64);
881+
wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
882+
tensor.data.size);
883+
return tensor;
861884
}
862885

863886
inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
@@ -885,7 +908,10 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
885908
size_t shift = (i % 4) * 8;
886909
packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
887910
}
888-
return createTensor(ctx, shape, ku32, packed.data());
911+
Tensor tensor = createTensor(ctx, shape, ku8);
912+
wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
913+
tensor.data.size);
914+
return tensor;
889915
}
890916

891917
// Overload for uint16_t: pack two 16‑bit integers into one 32‑bit unsigned
@@ -901,7 +927,10 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
901927
size_t shift = (i % 2) * 16;
902928
packed[idx] |= (static_cast<uint32_t>(data[i]) << shift);
903929
}
904-
return createTensor(ctx, shape, ku32, packed.data());
930+
Tensor tensor = createTensor(ctx, shape, ku16);
931+
wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
932+
tensor.data.size);
933+
return tensor;
905934
}
906935

907936
// Overload for uint64_t: pack each 64‑bit integer into two 32‑bit unsigned
@@ -916,7 +945,10 @@ inline Tensor createTensor(Context &ctx, const Shape &shape, NumType dtype,
916945
packed[2 * i] = static_cast<uint32_t>(val & 0xFFFFFFFF);
917946
packed[2 * i + 1] = static_cast<uint32_t>(val >> 32);
918947
}
919-
return createTensor(ctx, shape, ku32, packed.data());
948+
Tensor tensor = createTensor(ctx, shape, ku64);
949+
wgpuQueueWriteBuffer(ctx.queue, tensor.data.buffer, 0, packed.data(),
950+
tensor.data.size);
951+
return tensor;
920952
}
921953

922954
/**
@@ -1987,7 +2019,7 @@ inline void toCPU(Context &ctx, WGPUBuffer buffer, NumType dtype, void *output,
19872019
case kf32:
19882020
case ku32:
19892021
case ki32: {
1990-
size_t byteSize = numElements * sizeBytes(dtype);
2022+
size_t byteSize = sizeBytes(dtype, numElements);
19912023
toCPU(ctx, buffer, output, byteSize, sourceOffset);
19922024
break;
19932025
}

test/test_gpu.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -415,11 +415,11 @@ void testNumTypeSizes() {
415415

416416
assert(sizeBytes(kf16) == 2);
417417
assert(sizeBytes(kf32) == 4);
418-
assert(sizeBytes(ki8) == sizeof(uint8_t)); // typically 1
419-
assert(sizeBytes(ki16) == sizeof(uint16_t)); // typically 2
418+
assert(sizeBytes(ki8) == sizeof(uint32_t)); // ki8 is packed into uint32_t.
419+
assert(sizeBytes(ki16) == sizeof(uint32_t)); // ki16 is packed into uint32_t.
420420
assert(sizeBytes(ki32) == sizeof(int32_t)); // typically 4
421-
assert(sizeBytes(ku8) == sizeof(uint8_t)); // typically 1
422-
assert(sizeBytes(ku16) == sizeof(uint16_t)); // typically 2
421+
assert(sizeBytes(ku8) == sizeof(uint32_t)); // ku8 is packed into uint32_t.
422+
assert(sizeBytes(ku16) == sizeof(uint32_t)); // ku16 is packed into uint32_t.
423423
assert(sizeBytes(ku32) == sizeof(uint32_t)); // typically 4
424424

425425
LOG(kDefLog, kInfo, "testNumTypeSizes passed.");

0 commit comments

Comments (0)