microsoft
diff --git a/‎.vscode/settings.json
+2-1 b/‎.vscode/settings.json
+2-1
diff --git a/‎csrc/kernels/copy/copy_traits.cuh
+2-2 b/‎csrc/kernels/copy/copy_traits.cuh
+2-2
diff --git a/‎csrc/kernels/copy/copy.cuh ‎csrc/kernels/copy/global_to_shared.cuh b/‎csrc/kernels/copy/copy.cuh ‎csrc/kernels/copy/global_to_shared.cuh
diff --git a/‎csrc/kernels/copy/mod.cuh
+2-2 b/‎csrc/kernels/copy/mod.cuh
+2-2
diff --git a/‎csrc/kernels/copy/vectorized.cuh
+75 b/‎csrc/kernels/copy/vectorized.cuh
+75
diff --git a/‎csrc/kernels/copy/warp.cuh
+16-2 b/‎csrc/kernels/copy/warp.cuh
+16-2
diff --git a/‎csrc/kernels/decode.cuh
+66 b/‎csrc/kernels/decode.cuh
+66
diff --git a/‎csrc/kernels/copy/layout.cuh ‎csrc/kernels/layout.cuh b/‎csrc/kernels/copy/layout.cuh ‎csrc/kernels/layout.cuh
diff --git a/‎csrc/kernels/quant_gemv_traits.cuh
+39-13 b/‎csrc/kernels/quant_gemv_traits.cuh
+39-13
@@ -3,6 +3,7 @@
         "--style={based_on_s'tyle: google, column_limit: 80, indent_width: 4}"
     ],
     "files.associations": {
-        "optional": "cpp"
+        "optional": "cpp",
+        "atomic": "cpp"
     }
 }
@@ -2,14 +2,14 @@
 // Licensed under the MIT License.
 #pragma once
 
-#include "kernels/copy/layout.cuh"
+#include "kernels/layout.cuh"
 
 namespace vptq::kernels::copy {
 namespace tl = vptq::tile_layout;
 
 template <typename DType>
 struct AccessInfo {
-  // the maximal width of vectorized access.
+  // the maximal width of vectorized access in bits and bytes
   static constexpr int kAccessInBits = 128;
   static constexpr int kAccessInBytes = 16;
 
 
@@ -4,8 +4,8 @@
 
 #include "config.cuh"
 #include "kernels/copy/atom.cuh"
-#include "kernels/copy/copy.cuh"
 #include "kernels/copy/copy_traits.cuh"
-#include "kernels/copy/layout.cuh"
+#include "kernels/copy/global_to_shared.cuh"
 #include "kernels/copy/sync.cuh"
+#include "kernels/copy/vectorized.cuh"
 #include "kernels/copy/warp.cuh"
@@ -0,0 +1,75 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+namespace vptq::kernels::copy {
+
+// TODO(ying): Define additional user-defined vectorized types if necessary
+/// Maps an element type `DType` and a pack width `kN` to the type used to move
+/// `kN` consecutive elements with a single load/store.
+///
+/// Only the (DType, kN) pairs specialized below are supported. The primary
+/// template is left undefined, so any other pair fails at compile time.
+template <typename DType, int kN>
+struct GetPackType;
+
+template <>
+struct GetPackType<__half, 2> {
+  using type = __half2;
+};
+
+template <>
+struct GetPackType<__bfloat16, 2> {
+  using type = __bfloat162;
+};
+
+template <>
+struct GetPackType<uint8_t, 4> {
+  using type = int;
+};
+
+template <>
+struct GetPackType<uint16_t, 2> {
+  using type = int;
+};
+
+template <>
+struct GetPackType<uint16_t, 4> {
+  using type = int2;
+};
+
+template <>
+struct GetPackType<uint, 4> {
+  using type = uint4;  // uint4 has native 128 bits load/store support
+};
+
+// Four 32-bit floats are moved as one float4.
+template <>
+struct GetPackType<float, 4> {
+  using type = float4;  // float4 has native 128 bits load/store support
+};
+
+// NOTE(review): this pairs four `float4` elements (64 bytes) with a single
+// 16-byte `float4`, unlike the other specializations where the pack type is
+// exactly `kN` elements wide. It is kept so that existing references still
+// compile, but `GetPackType<float, 4>` above is most likely what was
+// intended -- confirm and remove.
+template <>
+struct GetPackType<float4, 4> {
+  using type = float4;  // float4 has native 128 bits load/store support
+};
+
+/// Shorthand for `GetPackType<DType, kN>::type`.
+template <typename DType, int kN>
+using PackType = typename GetPackType<DType, kN>::type;
+
+/// Vectorized copy for a single access.
+///
+/// `kN` consecutive elements are reinterpreted as one value of the pack type
+/// `PackType<DType_, kN>` and moved with a single assignment. Both pointers
+/// passed to `operator()` must therefore satisfy the alignment requirement of
+/// that pack type, not just the alignment of `DType_`.
+///
+/// @param DType_ The data type of the elements to copy.
+/// @param kN The number of elements to pack into a vectorized copy. The packed
+///        elements must occupy no more than 128 bits (16 bytes) in total.
+template <typename DType_, int kN>
+struct PackedCopy {
+  using DType = DType_;
+  using Packed = PackType<DType, kN>;
+
+  // the maximum read/write transaction size in bytes for a thread
+  static constexpr int kMaxVecBytes = 16;
+
+  static_assert(sizeof(DType) * kN <= kMaxVecBytes,
+                "The total number of bytes must be less than or equal to the "
+                "maximum width of a vectorized instruction.");
+
+  // Guards against a pack type that is wider or narrower than the elements it
+  // stands for; such a mismatch would silently copy the wrong number of bytes.
+  static_assert(sizeof(Packed) == sizeof(DType) * kN,
+                "The pack type must be exactly as wide as kN elements.");
+
+  // This ctor does nothing; it only carries the DEVICE qualifier (defined
+  // outside this file) so that the functor can be constructed in device code.
+  DEVICE PackedCopy() {}
+
+  /// Copies `kN` elements from `src_` to `dst_` with one packed load/store.
+  /// The two ranges are accessed as a single `Packed` value each.
+  DEVICE void operator()(const DType* src_, DType* dst_) const {
+    const Packed* src = reinterpret_cast<const Packed*>(src_);
+    Packed* dst = reinterpret_cast<Packed*>(dst_);
+    *dst = *src;
+  }
+};
+
+}  // namespace vptq::kernels::copy
@@ -2,7 +2,7 @@
 // Licensed under the MIT License.
 #pragma once
 
-#include "kernels/copy/layout.cuh"
+#include "kernels/layout.cuh"
 
 namespace vptq::kernels::copy {
 
@@ -15,16 +15,30 @@ struct WarpCounter {
     next_warp_ = kNumWarpsPerTile;
   }
 
+  // TODO(ying): simplify these calculations
   HOST_DEVICE int cur() const { return cur_warp_; }
 
   HOST_DEVICE int next() const { return next_warp_; }
 
-  HOST_DEVICE void operator++() {  // TODO(ying): simplify these calculations
+  HOST_DEVICE int next(int i) const {  // peek only; no member is modified
+    int wid = next_warp_ + i * kNumWarpsPerTile;  // next() plus i strides
+    wid = wid > kNumWarps ? wid % kNumWarps : wid;  // wid == kNumWarps is kept
+    return wid;
+  }
+
+  HOST_DEVICE void operator++() {  // cur takes next; next += kNumWarpsPerTile
     cur_warp_ = next_warp_ % kNumWarps;
     next_warp_ += kNumWarpsPerTile;
     next_warp_ = next_warp_ > kNumWarps ? next_warp_ % kNumWarps : next_warp_;
   }
 
+  HOST_DEVICE WarpCounter& operator+=(int n) {
+    cur_warp_ = next_warp_ % kNumWarps;  // same update as operator++, for any n
+    next_warp_ += (n * kNumWarpsPerTile);  // next moves n strides, not one
+    next_warp_ = next_warp_ > kNumWarps ? next_warp_ % kNumWarps : next_warp_;
+    return *this;  // allows chaining
+  }
+
 private:
   int cur_warp_;
   int next_warp_;
 
@@ -0,0 +1,66 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+#pragma once
+
+#include "kernels/copy/copy_traits.cuh"
+#include "kernels/copy/vectorized.cuh"
+#include "util/debug.cuh"
+
+namespace vptq::kernels {
+using namespace copy;
+
+/// Per-thread decoder of quantized weights.
+///
+/// Only the first stage is implemented so far: every thread copies its own
+/// `kNumPerThread` main indices and residual indices into thread-local arrays
+/// using packed (vectorized) accesses. The codebooks, `alpha`, `beta` and
+/// `output` are accepted by `operator()` but are not used yet.
+///
+/// @param DType_ The element type of the codebooks, scales and output.
+/// @param IdType_ The data type of the indices for main centroids.
+/// @param ResIdType_ The data type of the indices for residual centroids;
+///        currently required to be the same as `IdType_`.
+/// @param kNumPerThread_ The number of indices handled by a single thread.
+///        Must be a multiple of the number of indices packed into one access.
+/// @param kVecLen_ Exposed as `kVecLen`; not used by the code in this struct.
+/// @param Base Not used by the code in this struct.
+template <typename DType_, typename IdType_, typename ResIdType_,
+          const int kNumPerThread_, const int kVecLen_,
+          typename Base = AccessInfo<DType_>>
+struct WeightDecoder {
+  using DType = DType_;
+  using IdType = IdType_;
+  using ResIdType = ResIdType_;
+
+  // TODO(ying): The current implementation requires that the indices for both
+  // main and residual centroids are stored in the same data type, such as both
+  // being uint16_t. If the main indices are in uint16_t and the residual
+  // indices are in uint8_t, additional handling will be required. This will be
+  // addressed in the next version.
+  static_assert(std::is_same_v<IdType, ResIdType>,
+                "The data type of indices for main and residual centroids must "
+                "be the same.");
+
+  static constexpr int kNumPerThread = kNumPerThread_;
+  static constexpr int kVecLen = kVecLen_;
+
+  /// Loads the indices owned by the calling thread.
+  ///
+  /// Thread `threadIdx.x` reads `kNumPerThread` consecutive entries starting
+  /// at `threadIdx.x * kNumPerThread` from both `ids` and `res_ids`. Each read
+  /// moves `kPackedIdsBytes` at once, so `ids` and `res_ids` must be aligned
+  /// to `kPackedIdsBytes`.
+  DEVICE void operator()(DType* output,          // output
+                         const DType* codebook,  // codebook for main centroids
+                         const DType* codebook_res,  // codebook for residual
+                         const IdType* ids,  // indices for main centroids
+                         const ResIdType* res_ids,  // indices for residual
+                         const DType* alpha, const DType* beta) {
+    // threads in a CTA are laid out in 1-D fashion.
+    int offset = threadIdx.x * kNumPerThread;
+    const IdType* ids_ = ids + offset;  // indices for the current thread
+    // residual indices for the current thread
+    const ResIdType* res_ids_ = res_ids + offset;
+
+    // Thread-local copies of the indices and residual indices. They are
+    // written through `PackedCopy`, which stores `kPackedIdsBytes` at a time,
+    // so they must be aligned to that width rather than to `alignof(IdType)`.
+    alignas(kPackedIdsBytes) IdType reg_ids[kNumPerThread];
+    alignas(kPackedIdsBytes) ResIdType reg_residual_ids[kNumPerThread];
+
+#pragma unroll
+    for (int i = 0; i < kNumPerThread; i += kPackedNum) {
+      copy_ids(&ids_[i] /*src*/, &reg_ids[i] /*dst*/);
+      copy_ids(&res_ids_[i] /*src*/, &reg_residual_ids[i] /*dst*/);
+    }
+  }
+
+private:
+  // Indices are packed into 4 bytes in the current implementation, stored in a
+  // shared memory bank. This can be tuned if needed.
+  static constexpr int kPackedIdsBytes = 4;
+  static constexpr int kPackedNum = kPackedIdsBytes / sizeof(IdType);
+  static_assert(kPackedNum, "kPackedNum must be greater than 0");
+  // The copy loop advances by whole packs; a trailing partial pack would read
+  // and write past the end of the per-thread index arrays.
+  static_assert(kPackedNum == 0 || kNumPerThread % kPackedNum == 0,
+                "kNumPerThread must be a multiple of kPackedNum");
+  using VecCopy = PackedCopy<IdType, kPackedNum>;
+  VecCopy copy_ids;
+};
+
+}  // namespace vptq::kernels
@@ -2,7 +2,8 @@
 // Licensed under the MIT License.
 #pragma once
 
-#include "copy/mod.cuh"
+#include "kernels/copy/mod.cuh"
+#include "kernels/decode.cuh"
 
 #include <cute/tensor.hpp>
 
@@ -16,8 +17,9 @@ namespace {
 template <const int a, const int b>
 static constexpr int divup = (a + b - 1) / b;
 
-template <typename DType, const int kTileSize, const int kVecLen,
-          const int kNumCentroids, const int kNumResCentroids>
+template <typename DType, typename IdType, typename ResIdType,
+          const int kTileSize, const int kVecLen, const int kNumCentroids,
+          const int kNumResCentroids>
 struct SharedStorageImpl {
   ///==== Shared memory for inputs ====///
   static constexpr int kSizeCodebook = kNumCentroids * kVecLen;
@@ -29,8 +31,11 @@ struct SharedStorageImpl {
   static constexpr int kSizeInputs = 3 * kTileSize;
   array_aligned<DType, kSizeInputs, 128> inputs;
 
-  static constexpr int kSizeIndices = kTileSize * 2;
-  array_aligned<uint16_t, kTileSize * 2> indices;
+  // TODO(ying): Support residual indices stored in uint8_t
+  static_assert(std::is_same_v<IdType, ResIdType>,
+                "The data type of indices for main and residual centroids must "
+                "be the same.");
+  array_aligned<IdType, kTileSize * 2> indices;
 
   ///==== Shared memory for intermediate results ====///
   static constexpr int kSizeWeights = kTileSize * kVecLen;
@@ -42,7 +47,7 @@ struct SharedStorageImpl {
   static constexpr int kSmemSize = ((kSizeCodebook + kSizeCodebookRes +
                                      kSizeInputs + kSizeWeights + kSizeOut) *
                                     sizeof(DType)) +
-                                   kSizeIndices * sizeof(uint16_t);
+                                   2 * kTileSize * sizeof(IdType);
 };
 
 template <typename DType, const int kThreads, const int kNumCentroids,
@@ -83,7 +88,8 @@ struct CodebookTraits : public Base {
 
 }  // namespace
 
-template <typename DType, const int kThreads,        //
+template <typename DType, typename IdType, typename ResIdType,
+          const int kThreads,                        //
           const int kTileSize_, const int kVecLen_,  //
           const int kNumCentroids_, const int kNumResCentroids_,
           typename Base = copy::AccessInfo<DType>>
@@ -95,8 +101,9 @@ struct QuantGemvKeTraits : public Base {
   static constexpr int kTileSize = kTileSize_;
 
   /// allocate shared memory
-  using SharedStorage = SharedStorageImpl<DType, kTileSize, kVecLen,
-                                          kNumCentroids, kNumResCentroids>;
+  using SharedStorage =
+      SharedStorageImpl<DType, IdType, ResIdType, kTileSize, kVecLen,
+                        kNumCentroids, kNumResCentroids>;
   /// configurations for loading codebooks
   using MainCentroidTraits =
       CodebookTraits<DType, kThreads, kNumCentroids, kVecLen>;
@@ -131,14 +138,33 @@ struct QuantGemvKeTraits : public Base {
 
   /// configurations for loading tiled indices
   static constexpr int kThreadsIndex =
-      kTileSize * sizeof(uint16_t) / Base::kAccessInBytes;
+      kTileSize * sizeof(IdType) / Base::kAccessInBytes;
   static_assert(kThreadsIndex <= kThreads,
                 "The current implementation requires that the number of "
                 "threads used to load a single index tile must be less than or "
                 "equal to the number of threads in the block.");
-  using IndexLoader = copy::GlobalToSharedInputLoader<uint16_t, kTileSize>;
-  // storer is defined for debugging purposes
-  using IndexStorer = copy::SharedToGlobalInputStorer<uint16_t, kTileSize>;
+
+  // TODO(ying): The current implementation requires that the indices for both
+  // main and residual centroids are stored in the same data type. This will be
+  // addressed in the next version.
+  static_assert(std::is_same_v<IdType, ResIdType>,
+                "The data type of indices for main and residual centroids must "
+                "be the same.");
+  using IndexLoader = copy::GlobalToSharedInputLoader<IdType, 2 * kTileSize>;
+  using IndexStorer = copy::SharedToGlobalInputStorer<IdType, 2 * kTileSize>;
+
+  /// configurations for decoding indices
+  // Ensure the indices can be stored aligned with shared memory banks, and
+  // that a single thread decodes at least `kIdsPerBank` indices.
+  static constexpr int kBankBytes = 4;
+  static_assert(kBankBytes % sizeof(ResIdType) == 0);
+  static constexpr int kIdsPerBank = kBankBytes / sizeof(ResIdType);
+  // how many indices are decoded by a single thread
+  static_assert(kTileSize % (kThreads * kIdsPerBank) == 0);
+  static constexpr int kDecodeNumPerThread = kTileSize / kThreads;
+
+  using Decoder =
+      WeightDecoder<DType, IdType, ResIdType, kDecodeNumPerThread, kVecLen>;
 };
 
 }  // namespace vptq::kernels
Original file line number	Diff line number	Diff line change
`@@ -3,6 +3,7 @@`
`3`	`3`	`"--style={based_on_s'tyle: google, column_limit: 80, indent_width: 4}"`
`4`	`4`	`],`
`5`	`5`	`"files.associations": {`
`6`		`- "optional": "cpp"`
	`6`	`+ "optional": "cpp",`
	`7`	`+ "atomic": "cpp"`
`7`	`8`	`}`
`8`	`9`	`}`