Add storer and verify loaded codebook.

lcy-seso · lcy-seso · commit ee981fc57a91 · 2025-02-10T03:42:03.000-08:00
diff --git a/csrc/config.cuh b/csrc/config.cuh
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 #pragma once
 
-#if defined(__CUDA_ARCH__)
+#if (defined(__CUDA_ARCH__) || defined(USE_ROCM))
   #define HOST_DEVICE __forceinline__ __host__ __device__
   #define DEVICE __forceinline__ __device__
   #define HOST __forceinline__ __host__
@@ -12,30 +12,28 @@
   #define HOST inline
 #endif
 
-#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
-  #define CP_ASYNC_SM80_ENABLED
-#endif
-
 #if defined(USE_ROCM)
   #include <hip/hip_bf16.h>
   #include <hip/hip_fp16.h>
 
-  #define VPTQ_LDG(arg) __ldg(arg)
-  #define SHFL_DOWN(val, offset) __shfl_down(val, offset)
-  #define WARP_SIZE warpSize
-
 typedef __hip_bfloat162 __bfloat162;
 typedef __hip_bfloat16 __bfloat16;
 
+  #define VPTQ_LDG(arg) __ldg(arg)
+  #define SHFL_DOWN(val, offset) __shfl_down(val, offset)
+  #define WARP_SIZE warpSize
 #else
   #include <cuda_bf16.h>
   #include <cuda_fp16.h>
 
+typedef __nv_bfloat162 __bfloat162;
+typedef __nv_bfloat16 __bfloat16;
+
   #define WARP_SIZE 32
   #define VPTQ_LDG(arg) *(arg)
   #define SHFL_DOWN(val, offset) __shfl_down_sync(0xffffffff, val, offset)
+#endif
 
-typedef __nv_bfloat162 __bfloat162;
-typedef __nv_bfloat16 __bfloat16;
-
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
+  #define CP_ASYNC_SM80_ENABLED
 #endif
diff --git a/csrc/copy/copy.cuh b/csrc/copy/copy.cuh
@@ -10,9 +10,12 @@ namespace vptq::copy {
 using namespace cute;
 
 /// TODO(ying); the current implementation supports load row-major data only.
-template <typename DType, const int kThreads, const int64_t kRows,
-          const int64_t kCols, typename Base = AccessInfo<DType>>
+template <typename DType, const int kThreads, const int64_t kRows_,
+          const int64_t kCols_, typename Base = AccessInfo<DType>>
 struct GlobalToSharedLoader : public Base {
+  static constexpr int kRows = kRows_;
+  static constexpr int kCols = kCols_;
+
   DEVICE void operator()(const DType* src_, DType* dst_) {
     int tid = threadIdx.x;
 
@@ -32,12 +35,10 @@ struct GlobalToSharedLoader : public Base {
   }
 
 private:
-  // source
   using GlobalLayout =
       cute::Layout<Shape<Int<kRows>, Int<kCols>>, Stride<Int<kCols>, _1>>;
   GlobalLayout src_layout_;
 
-  // destination
   using SharedLayout =
       cute::Layout<Shape<Int<kRows>, Int<kCols>>, Stride<Int<kCols>, _1>>;
 
@@ -69,4 +70,60 @@ private:
   TiledCopy tiled_copy_;
 };
 
+/// Stores a row-major [kRows, kCols] tile from shared memory to global memory.
+/// TODO(ying): the current implementation supports row-major data only.
+template <typename DType, const int kThreads, const int64_t kRows_,
+          const int64_t kCols_, typename Base = AccessInfo<DType>>
+struct SharedToGlobalStorer : public Base {
+  static constexpr int kRows = kRows_;
+  static constexpr int kCols = kCols_;
+
+  /// Copies the shared-memory tile at `src_` to global memory at `dst_`.
+  DEVICE void operator()(const DType* src_, DType* dst_) {
+    int tid = threadIdx.x;
+
+    auto stile = make_tensor(make_smem_ptr(src_), src_layout_);
+    auto gtile = make_tensor(make_gmem_ptr(dst_), dst_layout_);
+
+    auto storer = tiled_copy_.get_thread_slice(tid);
+
+    auto src = storer.partition_S(stile);
+    auto dst = storer.partition_D(gtile);
+
+#pragma unroll
+    for (int i = 0; i < int(size<1>(src)); ++i)
+#pragma unroll
+      for (int j = 0; j < int(size<2>(src)); ++j)
+        cute::copy(tiled_copy_, src(cute::_, i, j), dst(cute::_, i, j));
+  }
+
+private:
+  using SharedLayout =
+      cute::Layout<Shape<Int<kRows>, Int<kCols>>, Stride<Int<kCols>, _1>>;
+  SharedLayout src_layout_;
+
+  using GlobalLayout =
+      cute::Layout<Shape<Int<kRows>, Int<kCols>>, Stride<Int<kCols>, _1>>;
+  GlobalLayout dst_layout_;
+
+  // Elements per access, derived from Base instead of a hard-coded 8.
+  static constexpr int kValuesPerThread =
+      Base::kAccessInBits / Base::kElementBits;
+  static_assert(kValuesPerThread > 0);
+  static constexpr int kThreadCols =
+      kCols * Base::kElementBits / Base::kAccessInBits;
+  static_assert(kThreadCols > 0);
+  static constexpr int kThreadRows = kThreads / kThreadCols;
+
+  using ThreadLayout = cute::Layout<Shape<Int<kThreadRows>, Int<kThreadCols>>,
+                                    Stride<Int<kThreadCols>, _1>>;
+  using ValueLayout = cute::Layout<Shape<_1, Int<kValuesPerThread>>>;
+
+  using CopyInst = Copy_Atom<DefaultCopy, DType>;
+
+  using TiledCopy =
+      decltype(make_tiled_copy(CopyInst{}, ThreadLayout{}, ValueLayout{}));
+  TiledCopy tiled_copy_;
+};
+
 }  // namespace vptq::copy
diff --git a/csrc/quant_gemv.cuh b/csrc/quant_gemv.cuh
@@ -2,6 +2,7 @@
 // Licensed under the MIT License.
 #pragma once
 
+#include "util/convert.cuh"
 #include "util/cuda_utils.cuh"
 
 namespace vptq {
@@ -146,7 +147,7 @@ __global__ void WqA16WithOutliers_PackIndice(
 #pragma unroll
   for (int gi = 0; gi < GROUPSIZE; ++gi) {
     float reduce_out = 0.f;
-    reduce_out = cuda::ConvertToFloat(tmp_output[gi]);
+    reduce_out = to_float(tmp_output[gi]);
     reduce_out = cuda::warpReduceSum<WARP_SIZE>(reduce_out);
     if (landid == 0) {
       shared_output[gi][warpid] = reduce_out;
@@ -172,10 +173,10 @@ __global__ void WqA16WithOutliers_PackIndice(
       if (landid == 0 && (in_y * GROUPSIZE + wid) < out_features) {
         if constexpr (Do_Reduce) {
           out[(wid)*gridDim.z] =
-              cuda::ConvertFromFloat<scalar_t>(reduce_out, zero_value) +
+              from_float<scalar_t>(reduce_out, zero_value) +
               ((bidz == 0 && bias != 0) ? bias[wid] : zero_value);
         } else {
-          out[wid] = cuda::ConvertFromFloat<scalar_t>(reduce_out, zero_value) +
+          out[wid] = from_float<scalar_t>(reduce_out, zero_value) +
                      ((bias != 0) ? bias[wid] : zero_value);
         }
       }
diff --git a/csrc/quant_gemv_v2.cu b/csrc/quant_gemv_v2.cu
@@ -17,6 +17,9 @@ struct QuantGemvKeTraits : public Base {
   using LoaderG2S =
       copy::GlobalToSharedLoader<DType, kThreads, kNumCentroids / kPackedVecs,
                                  kVecLen * kPackedVecs>;
+  using StorerS2G =
+      copy::SharedToGlobalStorer<DType, kThreads, kNumCentroids / kPackedVecs,
+                                 kVecLen * kPackedVecs>;
 };
 
 /**
@@ -83,7 +86,11 @@ torch::Tensor quant_gemv_v2(
       "Supported vector length in vectorized quantization: 4, 8, 12, or 16.");
 
   torch::Tensor output;
-  output = at::empty({in_features, out_features}, centroids.options());
+  // output = at::empty({in_features, out_features}, centroids.options());
+
+  // NOTE: test only; output has the codebook shape to verify the load.
+  output =
+      at::empty({num_codebooks, num_centroids, vec_len}, centroids.options());
 
   auto stream = at::cuda::getCurrentCUDAStream().stream();
 
@@ -117,7 +124,9 @@ torch::Tensor quant_gemv_v2(
 
         std::cout << "centroid number: " << kNumCentroids
                   << "; vector length: " << kVecLen
-                  << "; smem_size: " << smem_size / 1024 << "KB" << std::endl;
+                  << "; smem_size: " << smem_size / 1024
+                  << "KB; max smem size: " << kMaxSmemPerBlock / 1024 << "KB"
+                  << std::endl;
 
         using Config =
             QuantGemvKeTraits<nv_type, kThreads, kNumCentroids, kVecLen>;
diff --git a/csrc/quant_gemv_v2.cuh b/csrc/quant_gemv_v2.cuh
@@ -3,6 +3,7 @@
 #pragma once
 
 #include "copy/sync.cuh"
+#include "util/convert.cuh"
 #include "util/debug.cuh"
 
 namespace vptq {
@@ -22,10 +23,13 @@ __global__ void quant_gemv_v2_kernel(
   auto* buf = reinterpret_cast<DType*>(buf_);
 
   typename KeTraits::LoaderG2S loader;
+  typename KeTraits::StorerS2G storer;
+
   loader(centroids, buf);
   __copy_async();
   __syncthreads();
 
+  storer(buf, output);
   return;
 }
 
diff --git a/csrc/util/convert.cuh b/csrc/util/convert.cuh
@@ -0,0 +1,32 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+#include "config.cuh"
+
+namespace vptq {
+template <typename T>  // T must be __bfloat16, float or __half
+T DEVICE from_float(float v, T vv) {
+  (void)vv;  // the incoming value of `vv` is never read
+  if constexpr (std::is_same_v<T, float>) {
+    return v;
+  } else if constexpr (std::is_same_v<T, __bfloat16>) {
+    return __float2bfloat16(v);
+  } else {
+    static_assert(std::is_same_v<T, __half>);
+    return __float2half(v);
+  }
+}
+
+template <typename T>  // T must be __bfloat16, float or __half
+float DEVICE to_float(T v) {
+  if constexpr (std::is_same_v<T, float>) {
+    return v;
+  } else if constexpr (std::is_same_v<T, __bfloat16>) {
+    return __bfloat162float(v);
+  } else {
+    static_assert(std::is_same_v<T, __half>);
+    return __half2float(v);
+  }
+}
+}  // namespace vptq
diff --git a/csrc/util/cuda_utils.cuh b/csrc/util/cuda_utils.cuh
@@ -45,33 +45,8 @@ struct TypeVec2<float> {
   typedef float2 type;
 };
 
-template <typename T>
-T __device__ __forceinline__ ConvertFromFloat(float v, T vv) {
-  (void)(vv);
-  if constexpr (std::is_same<T, __bfloat16>::value) {
-    return vv = __float2bfloat16(v);
-  } else if constexpr (std::is_same<T, float>::value) {
-    return vv = v;
-  } else {
-    static_assert(std::is_same<T, __half>::value);
-    return vv = __float2half(v);
-  }
-}
-
-template <typename T>
-float __device__ __forceinline__ ConvertToFloat(T v) {
-  if constexpr (std::is_same<T, __bfloat16>::value) {
-    return __bfloat162float(v);
-  } else if constexpr (std::is_same<T, float>::value) {
-    return v;
-  } else {
-    static_assert(std::is_same<T, __half>::value);
-    return __half2float(v);
-  }
-}
-
 template <unsigned int WarpSize>
-__device__ __forceinline__ float warpReduceSum(float sum) {
+DEVICE float warpReduceSum(float sum) {
   if constexpr (WarpSize >= 64)
     sum += SHFL_DOWN(sum, 32);  // 0-16, 1-17, 2-18, etc.
   if constexpr (WarpSize >= 32)
@@ -86,8 +61,8 @@ __device__ __forceinline__ float warpReduceSum(float sum) {
 }
 
 template <int GROUPSIZE, typename T>
-__device__ __forceinline__ void ldg_vec_x(
-    T* __restrict__ dst_t32, const uint32_t* __restrict__ src_u32) {
+DEVICE void ldg_vec_x(T* __restrict__ dst_t32,
+                      const uint32_t* __restrict__ src_u32) {
   uint32_t* dst_u32 = (uint32_t*)dst_t32;
   if constexpr (std::is_same<T, float>::value ||
                 std::is_same<T, float2>::value) {
@@ -133,8 +108,7 @@ __device__ __forceinline__ void ldg_vec_x(
 }
 
 template <int WBITS>
-__device__ __forceinline__ uint32_t iterator_packed_tensor(const uint32_t* ptr,
-                                                           int idx) {
+DEVICE uint32_t iterator_packed_tensor(const uint32_t* ptr, int idx) {
   if constexpr (WBITS == 32) {
     return ptr[idx];
   } else if constexpr (WBITS == 16) {
@@ -160,7 +134,7 @@ __device__ __forceinline__ uint32_t iterator_packed_tensor(const uint32_t* ptr,
 }  // namespace cuda
 
 template <typename T>
-T __device__ __forceinline__ FMA2(T a, T b, T c) {
+T DEVICE FMA2(T a, T b, T c) {
   if constexpr (std::is_same<T, __bfloat162>::value) {
 #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800) && !defined(USE_ROCM)
     float x =
@@ -197,7 +171,7 @@ T __device__ __forceinline__ FMA(T a, T b, T c) {
 }
 
 template <typename T>
-T __device__ __forceinline__ ADD2(T a, T b) {
+T DEVICE ADD2(T a, T b) {
   if constexpr (std::is_same<T, __bfloat162>::value) {
 #if (defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 800) || defined(USE_ROCM)
     float x = __bfloat162float(a.x) + __bfloat162float(b.x);
@@ -215,7 +189,7 @@ T __device__ __forceinline__ ADD2(T a, T b) {
 }
 
 template <typename T>
-T __device__ __forceinline__ ZERO_VALUE(T a) {
+T DEVICE ZERO_VALUE(T a) {
   if constexpr (std::is_same<T, __bfloat16>::value) {
 #if defined(USE_ROCM)
     return __float2bfloat16(0.0f);
diff --git a/run_build.sh b/run_build.sh
diff --git a/vptq/tests/ops/test_quant_gemv.py b/vptq/tests/ops/test_quant_gemv.py