Add utils and re-organize code structures.

lcy-seso · lcy-seso · commit b03225bb4d79 · 2025-02-06T02:14:41.000Z
diff --git a/csrc/dequant.cu b/csrc/dequant.cu
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "common.h"
 #include "dequant.cuh"
+#include "util/common.h"
 
 namespace vptq {
 
diff --git a/csrc/dequant.cuh b/csrc/dequant.cuh
@@ -1,8 +1,9 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
+
 #pragma once
 
-#include "cuda_utils.cuh"
+#include "util/cuda_utils.cuh"
 
 namespace vptq {
 
diff --git a/csrc/ops.cc b/csrc/ops.cc
@@ -2,7 +2,6 @@
 // Licensed under the MIT License.
 
 /// register bindings for VPTQ APIs in this file. ///
-
 #include <torch/extension.h>
 
 namespace vptq {
diff --git a/csrc/quant_gemv.cu b/csrc/quant_gemv.cu
@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation. All rights reserved.
 // Licensed under the MIT License.
 
-#include "common.h"
 #include "quant_gemv.cuh"
+#include "util/common.h"
 
 namespace vptq {
 
diff --git a/csrc/quant_gemv.cuh b/csrc/quant_gemv.cuh
@@ -2,7 +2,8 @@
 // Licensed under the MIT License.
 #pragma once
 
-#include "cuda_utils.cuh"
+#include "util/cuda_utils.cuh"
+#include "util/debug.cuh"
 
 namespace vptq {
 
diff --git a/csrc/quant_gemv_v2.cu b/csrc/quant_gemv_v2.cu
@@ -1,9 +1,27 @@
-#include "common.h"
-#include "dispatch_macros.h"
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
 #include "quant_gemv_v2.cuh"
+#include "util/common.h"
+#include "util/cuda_utils.cuh"
+#include "util/dispatch_macros.h"
 
 namespace vptq {
 
+/**
+ * @brief Quantized GEMV kernel.
+ * @param act The input activations.
+ * @param bias The bias.
+ * @param indices The indices.
+ * @param centroids The codebook for the main vector quantized weights.
+ *        Stored in row-major order. Element type: fp16, bf16.
+ *        Shape: (num_codebooks, num_centroids, vec_len).
+ * @param residual_centroids The residual centroids.
+ * @param scale_weights The scale weights.
+ * @param scale_bias The scale bias.
+ * @param in_features The number of input features.
+ * @param out_features The number of output features.
+ */
 torch::Tensor quant_gemv_v2(
     const torch::Tensor& act, const c10::optional<torch::Tensor>& bias,
     const torch::Tensor& indices, const torch::Tensor& centroids,
@@ -44,6 +62,9 @@ torch::Tensor quant_gemv_v2(
   const int64_t num_centroids = centroids.size(1);
   const int64_t vec_len = centroids.size(2);
 
+  TORCH_CHECK_LT(batch, 16)
+      << "In GEMV, the batch size is suggested to be less than 16.";
+
   TORCH_CHECK_EQ(num_codebooks, 1) << "Only support one codebook.";
 
   TORCH_CHECK(
@@ -60,7 +81,7 @@ torch::Tensor quant_gemv_v2(
   dim3 blocks(batch, num_codebooks, block_z);
   // FIXME(ying): refine the choice of threads in a thread block.
   // For test at the moment.
-  dim3 threads(256, 1, 1);
+  dim3 threads(256, 1, 1);  // eight warps (32 threads each) per block.
 
   std::cout << "num_codebooks: " << num_codebooks << std::endl
             << "num_centroids: " << num_centroids << std::endl
diff --git a/csrc/quant_gemv_v2.cuh b/csrc/quant_gemv_v2.cuh
@@ -2,7 +2,8 @@
 // Licensed under the MIT License.
 #pragma once
 
-#include "cuda_utils.cuh"
+#include "util/cuda_utils.cuh"
+#include "util/debug.cuh"
 
 namespace vptq {
 
@@ -16,11 +17,6 @@ __global__ void quant_gemv_v2_kernel(
     const DType* const __restrict__ scale_weights,
     const DType* const __restrict__ scale_bias, int64_t in_features,
     int64_t out_features, int64_t vec_len) {
-  if (threadIdx.x == 0 && blockIdx.x == 0 && blockIdx.y == 0 &&
-      blockIdx.z == 0) {
-    printf("quant_gemv_v2_kernel\n");
-  }
-
   return;
 }
 
diff --git a/csrc/util/common.h b/csrc/util/common.h
diff --git a/csrc/util/config.cuh b/csrc/util/config.cuh
@@ -0,0 +1,42 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#if defined(__CUDA_ARCH__)
+  #define HOST_DEVICE __forceinline__ __host__ __device__
+  #define DEVICE __forceinline__ __device__
+  #define HOST __forceinline__ __host__
+#else
+  #define HOST_DEVICE inline
+  #define DEVICE inline
+  #define HOST inline
+#endif
+
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+  #define CP_ASYNC_SM80_ENABLED
+#endif
+
+#if defined(USE_ROCM)
+  #include <hip/hip_bf16.h>
+  #include <hip/hip_fp16.h>
+
+  #define VPTQ_LDG(arg) __ldg(arg)
+  #define SHFL_DOWN(val, offset) __shfl_down(val, offset)
+  #define WARP_SIZE warpSize
+
+typedef __hip_bfloat162 __bfloat162;
+typedef __hip_bfloat16 __bfloat16;
+
+#else
+  #include <cuda_bf16.h>
+  #include <cuda_fp16.h>
+
+  #define WARP_SIZE 32
+  #define VPTQ_LDG(arg) *(arg)
+  #define SHFL_DOWN(val, offset) __shfl_down_sync(0xffffffff, val, offset)
+
+typedef __nv_bfloat162 __bfloat162;
+typedef __nv_bfloat16 __bfloat16;
+
+#endif
diff --git a/csrc/util/copy.cuh b/csrc/util/copy.cuh
@@ -0,0 +1,6 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+namespace vptq {
+namespace cutlass_wrapper {}  // namespace cutlass_wrapper
+}  // namespace vptq
diff --git a/csrc/util/cuda_utils.cuh b/csrc/util/cuda_utils.cuh
@@ -2,41 +2,9 @@
 // Licensed under the MIT License.
 #pragma once
 
-#include <ATen/cuda/CUDAContext.h>
-
-#if defined(USE_ROCM)
-  #include <hip/hip_bf16.h>
-  #include <hip/hip_fp16.h>
-
-  #define VPTQ_LDG(arg) __ldg(arg)
-  #define SHFL_DOWN(val, offset) __shfl_down(val, offset)
-  #define WARP_SIZE warpSize
-
-typedef __hip_bfloat162 __bfloat162;
-typedef __hip_bfloat16 __bfloat16;
-
-#else
-  #include <cuda_bf16.h>
-  #include <cuda_fp16.h>
+#include "config.cuh"
 
-  #define WARP_SIZE 32
-  #define VPTQ_LDG(arg) *(arg)
-  #define SHFL_DOWN(val, offset) __shfl_down_sync(0xffffffff, val, offset)
-
-typedef __nv_bfloat162 __bfloat162;
-typedef __nv_bfloat16 __bfloat16;
-
-#endif
-
-#if defined(__CUDA_ARCH__)
-  #define HOST_DEVICE __forceinline__ __host__ __device__
-  #define DEVICE __forceinline__ __device__
-  #define HOST __forceinline__ __host__
-#else
-  #define HOST_DEVICE inline
-  #define DEVICE inline
-  #define HOST inline
-#endif
+#include <ATen/cuda/CUDAContext.h>
 
 namespace vptq {
 
diff --git a/csrc/util/debug.cuh b/csrc/util/debug.cuh
@@ -0,0 +1,35 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+#include "config.cuh"
+
+#include <cuda_runtime_api.h>
+
+namespace vptq {
+
+DEVICE bool block(int bid) {
+  int id =
+      blockIdx.x + blockIdx.y * gridDim.x + blockIdx.z * gridDim.x * gridDim.y;
+  return id == bid;
+}
+
+DEVICE bool thread(int tid, int bid) {
+  int id = threadIdx.x + threadIdx.y * blockDim.x +
+           threadIdx.z * blockDim.x * blockDim.y;
+  return id == tid && block(bid);
+}
+
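+// Usage examples (predicates for guarding code inside a kernel):
+// if (thread(0, 0)) { ... }  // only thread 0 of block 0
+// if (thread(37)) { ... }    // only thread 37 of block 0
+// if (block(0)) { ... }      // every thread of block 0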
+
+DEVICE bool thread(int tid) { return thread(tid, 0); }
+
+DEVICE bool thread0() { return thread(0, 0); }
+
+DEVICE bool block0() { return block(0); }
+
+}  // namespace vptq
diff --git a/csrc/util/dispatch_macros.h b/csrc/util/dispatch_macros.h