// Licensed under the MIT License.
#pragma once

+/// The Loader and Storer in this file use all threads in a thread block
+/// cooperatively to transfer data tiles between global and shared memory.
+
+#include "kernels/copy/atom.cuh"
#include "kernels/copy/copy_traits.cuh"
+#include "kernels/copy/warp.cuh"

#include <cute/tensor.hpp>

namespace vptq::kernels::copy {

-
+namespace tl = vptq::tile_layout;
using namespace cute;

-namespace {
-/// ld.shared
-template <const int kBytes>
-DEVICE void ld_shared(void* dst, uint32_t src);
-
-/// ld.shared - 16b
-template <>
-DEVICE void ld_shared<2>(void* dst, uint32_t src) {
-  asm volatile("ld.shared.u16 %0, [%1];\n"
-               : "=h"(*reinterpret_cast<uint16_t*>(dst))
-               : "r"(src));
-}
-
-/// ld.shared - 32b
-template <>
-DEVICE void ld_shared<4>(void* dst, uint32_t src) {
-  asm volatile("ld.shared.u32 %0, [%1];\n"
-               : "=r"(*reinterpret_cast<uint32_t*>(dst))
-               : "r"(src));
-}
-
-/// ld.shared - 64b
-template <>
-DEVICE void ld_shared<8>(void* dst, uint32_t src) {
-  uint2* dst_u64 = reinterpret_cast<uint2*>(dst);
-  asm volatile("ld.shared.v2.u32 {%0, %1}, [%2];\n"
-               : "=r"(dst_u64->x), "=r"(dst_u64->y)
-               : "r"(src));
-}
-
-/// ld.shared - 128b
-template <>
-DEVICE void ld_shared<16>(void* dst, uint32_t src) {
-  uint4* dst_u128 = reinterpret_cast<uint4*>(dst);
-  asm volatile("ld.shared.v4.u32 {%0, %1, %2, %3}, [%4];\n"
-               : "=r"(dst_u128->x), "=r"(dst_u128->y), "=r"(dst_u128->z),
-                 "=r"(dst_u128->w)
-               : "r"(src));
-}
-
-/// st.shared
-template <int kBytes>
-DEVICE void st_shared(uint32_t dst, void const* src);
-
-/// st.shared - 16b
-template <>
-DEVICE void st_shared<2>(uint32_t dst, void const* src) {
-  asm volatile("st.shared.u16 [%0], %1;\n"
-               :
-               : "r"(dst), "h"(*reinterpret_cast<uint16_t const*>(src)));
-}
-
-/// st.shared - 32b
-template <>
-DEVICE void st_shared<4>(uint32_t dst, void const* src) {
-  asm volatile("st.shared.u32 [%0], %1;\n"
-               :
-               : "r"(dst), "r"(*reinterpret_cast<uint32_t const*>(src)));
-}
-
-/// st.shared - 64b
-template <>
-DEVICE void st_shared<8>(uint32_t dst, void const* src) {
-  uint2 const* src_u64 = reinterpret_cast<uint2 const*>(src);
-  asm volatile("st.shared.v2.u32 [%0], {%1, %2};\n"
-               :
-               : "r"(dst), "r"(src_u64->x), "r"(src_u64->y));
-}
-
-/// st.shared - 128b
-template <>
-DEVICE void st_shared<16>(uint32_t dst, void const* src) {
-  uint4 const* src_u128 = reinterpret_cast<uint4 const*>(src);
-  asm volatile("st.shared.v4.u32 [%0], {%1, %2, %3, %4};\n"
-               :
-               : "r"(dst), "r"(src_u128->x), "r"(src_u128->y),
-                 "r"(src_u128->z), "r"(src_u128->w));
-}
-
-/// st.global
-template <int kBytes>
-DEVICE void st_global(void* dst, const void* src);
-
-template <>
-DEVICE void st_global<16>(void* dst, const void* src) {
-  uint4 const* src_u128 = reinterpret_cast<uint4 const*>(src);
-  asm volatile("st.global.v4.b32 [%0], {%1, %2, %3, %4};\n"
-               :
-               : "l"(dst), "r"(src_u128->x), "r"(src_u128->y),
-                 "r"(src_u128->z), "r"(src_u128->w));
-}
-}  // namespace
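The wrappers above address shared memory through PTX's 32-bit .shared window, which is why they take a uint32_t rather than a pointer: a generic C++ pointer must first be converted with __cvta_generic_to_shared. Below is a minimal usage sketch of that calling convention, assuming the wrappers are visible in the current scope; the kernel, buffer, and sizes are illustrative assumptions, not part of this file.

// Hypothetical illustration; kernel name, buffer, and sizes are assumptions.
__global__ void shared_roundtrip_example(float* out) {
  __shared__ float buf[128];
  float frag[4] = {1.f, 2.f, 3.f, 4.f};  // a 16-byte register fragment

  if (threadIdx.x < 32) {  // one warp touches 32 * 16B = 512B of shared memory
    // Convert a generic pointer into a 32-bit .shared-window address.
    uint32_t s_addr =
        static_cast<uint32_t>(__cvta_generic_to_shared(buf + 4 * threadIdx.x));
    st_shared<16>(s_addr, frag);                 // registers -> shared, 128-bit
    ld_shared<16>(frag, s_addr);                 // shared -> registers, 128-bit
    st_global<16>(out + 4 * threadIdx.x, frag);  // registers -> global, 128-bit
  }
}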
-
-template <int kBytes>
-DEVICE void ld_shared_st_global(void* dst, uint32_t src);
-
-template <>
-DEVICE void ld_shared_st_global<16>(void* dst, uint32_t src) {
-  unsigned tmp[4];
-  ld_shared<16>(tmp, src);
-  st_global<16>(dst, tmp);
-}
-
-template <const int kBytes>
-DEVICE void ld_global_st_shared(uint32_t dst, void const* src) {
-  static_assert(kBytes == 4 || kBytes == 8 || kBytes == 16);
-
-#if (__CUDA_ARCH__ >= 800)
-  // cp.async is available on Ampere (SM80, SM86) and newer architectures,
-  // e.g. Hopper (SM90).
-  // TODO(ying): add a wrapper to allow choosing between different caching
-  // policies (e.g. "cache all levels").
-  asm volatile("cp.async.cg.shared.global [%0], [%1], %2;\n" ::"r"(dst),
-               "l"(src), "n"(kBytes));
-#else
-  unsigned tmp[kBytes / 4];
-  if constexpr (kBytes == 16) {
-    asm volatile("ld.global.v4.b32 {%0, %1, %2, %3}, [%4];\n"
-                 : "=r"(tmp[0]), "=r"(tmp[1]), "=r"(tmp[2]), "=r"(tmp[3])
-                 : "l"(src));
-    asm volatile("st.shared.v4.b32 [%0], {%1, %2, %3, %4};\n" ::"r"(dst),
-                 "r"(tmp[0]), "r"(tmp[1]), "r"(tmp[2]), "r"(tmp[3]));
-  } else if constexpr (kBytes == 8) {
-    asm volatile("ld.global.v2.b32 {%0, %1}, [%2];\n"
-                 : "=r"(tmp[0]), "=r"(tmp[1])
-                 : "l"(src));
-    asm volatile("st.shared.v2.b32 [%0], {%1, %2};\n" ::"r"(dst), "r"(tmp[0]),
-                 "r"(tmp[1]));
-  } else if constexpr (kBytes == 4) {
-    asm volatile("ld.global.b32 %0, [%1];\n" : "=r"(tmp[0]) : "l"(src));
-    asm volatile("st.shared.b32 [%0], %1;\n" ::"r"(dst), "r"(tmp[0]));
-  }
-#endif
-}
-
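On SM80 and newer, ld_global_st_shared lowers to cp.async, which returns before the data has actually arrived in shared memory, so a caller has to commit the async group and wait on it before reading the tile. Below is a minimal sketch of that pattern, assuming the wrapper above is in scope; the kernel name, tile size, and 256-thread launch are illustrative assumptions.

// Hypothetical illustration; assumes a 256-thread block and a 1024-float tile.
__global__ void load_tile_example(const float* g_src) {
  __shared__ float tile[1024];

  // Each of 256 threads copies one 16-byte (4-float) chunk.
  int idx = 4 * threadIdx.x;
  uint32_t s_addr =
      static_cast<uint32_t>(__cvta_generic_to_shared(tile + idx));
  ld_global_st_shared<16>(s_addr, g_src + idx);

#if (__CUDA_ARCH__ >= 800)
  asm volatile("cp.async.commit_group;\n" ::);  // close the in-flight group
  asm volatile("cp.async.wait_group 0;\n" ::);  // block until it has landed
#endif
  __syncthreads();  // make the tile visible to every thread in the CTA
  // ... compute on `tile` ...
}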
template <typename DType, const int kNumPerAccess, typename ThreadLayout,
          typename GlobalLayout /*src*/, typename SharedLayout /*dst*/>
struct GlobalToSharedLoader {
@@ -213,4 +83,39 @@ private:
  TiledCopy tiled_copy_;
};

+/// NOTE: This configuration is specialized for copying a small tile whose
+/// size is smaller than the amount of data that all threads in a CTA access
+/// concurrently in a single pass.
+template <typename DType, const int kNumel, typename Base = AccessInfo<DType>>
+struct GlobalToSharedInputLoader : public Base {
+  static constexpr int kWarpTileShape = Base::kNumPerAccess * WARP_SIZE;
+  static constexpr int kThreads = kNumel / kWarpTileShape * WARP_SIZE;
+
+  DEVICE void operator()(const DType* src_, DType* dst_, int start_warp = 0) {
+    int warp_id = threadIdx.x / WARP_SIZE - start_warp;
+    int lane_id = threadIdx.x % WARP_SIZE;
+    int offset = warp_id * kWarpTileShape + lane_id * Base::kNumPerAccess;
+
+    ld_global_st_shared<Base::kAccessInBytes>(
+        static_cast<uint32_t>(__cvta_generic_to_shared(dst_ + offset)),
+        src_ + offset);
+  }
+};
+
+/// NOTE: This configuration is specialized for copying a small tile whose
+/// size is smaller than the amount of data that all threads in a CTA access
+/// concurrently in a single pass.
+template <typename DType, const int kNumel, typename Base = AccessInfo<DType>>
+struct SharedToGlobalInputStorer : public Base {
+  static constexpr int kWarpTileShape = Base::kNumPerAccess * WARP_SIZE;
+  static constexpr int kThreads = kNumel / kWarpTileShape * WARP_SIZE;
+
+  DEVICE void operator()(const DType* src_, DType* dst_, int start_warp = 0) {
+    int warp_id = threadIdx.x / WARP_SIZE - start_warp;
+    int lane_id = threadIdx.x % WARP_SIZE;
+    int offset = warp_id * kWarpTileShape + lane_id * Base::kNumPerAccess;
+
+    ld_shared_st_global<Base::kAccessInBytes>(
+        dst_ + offset,
+        static_cast<uint32_t>(__cvta_generic_to_shared(src_ + offset)));
+  }
+};
+
}  // namespace vptq::kernels::copy
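To make the constants in the two structs above concrete: with DType = __half and a 16-byte access width, Base::kNumPerAccess is 8, so one warp moves kWarpTileShape = 8 * 32 = 256 elements per access, and a 1024-element tile needs kThreads = 1024 / 256 * 32 = 128 threads (4 warps). Below is a hedged sketch of driving the new loader and storer back to back under that assumption; the kernel, shapes, and launch configuration are illustrative, not from this commit.

// Hypothetical illustration; assumes Base::kAccessInBytes == 16 and that at
// least loader.kThreads (128) threads participate in the launch.
__global__ void roundtrip_input_tile(const __half* g_src, __half* g_dst) {
  constexpr int kNumel = 1024;
  __shared__ __half s_tile[kNumel];

  vptq::kernels::copy::GlobalToSharedInputLoader<__half, kNumel> loader;
  vptq::kernels::copy::SharedToGlobalInputStorer<__half, kNumel> storer;

  if (threadIdx.x < loader.kThreads) loader(g_src, s_tile);
#if (__CUDA_ARCH__ >= 800)
  // The loader issues cp.async on SM80+: fence and wait before reading back.
  asm volatile("cp.async.commit_group;\n" ::);
  asm volatile("cp.async.wait_group 0;\n" ::);
#endif
  __syncthreads();
  if (threadIdx.x < storer.kThreads) storer(s_tile, g_dst);
}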