Skip to content

Commit d9c1dd6

Browse files
committed
comments
1 parent 94fb4ef commit d9c1dd6

File tree

1 file changed

+3
-2
lines changed
  • libflashinfer/include/flashinfer/attention/generic

1 file changed

+3
-2
lines changed

libflashinfer/include/flashinfer/attention/generic/decode.cuh

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -645,7 +645,8 @@ gpuError_t SingleDecodeWithKVCacheDispatched(Params params, typename Params::DTy
645645
const uint32_t num_kv_heads = params.num_kv_heads;
646646
const uint32_t seq_len = params.kv_len;
647647

648-
// AMD CDNA3 optimized vector size - prefer smaller vec_size for better occupancy
648+
// Optimizing vec_size for CDNA3 architecture.
649+
// This helps keep the dynamic shared memory allocation within the hardware threshold for CDNA3.
649650
constexpr uint32_t vec_size = (HEAD_DIM < 256U)
650651
? std::max(8UL / sizeof(DTypeKV), HEAD_DIM / 64UL)
651652
: std::max(8UL / sizeof(DTypeKV), HEAD_DIM / 32UL);
@@ -660,7 +661,7 @@ gpuError_t SingleDecodeWithKVCacheDispatched(Params params, typename Params::DTy
660661
std::max(get_heuristic_num_threads(GROUP_SIZE, sizeof(DTypeKV)), bdx * bdy);
661662
constexpr uint32_t bdz = num_threads / (bdx * bdy);
662663

663-
// AMD CDNA3 Reduce tile size to minimize shared memory usage
664+
// AMD CDNA3: Reduce tile size to accommodate the CDNA3 architecture's hardware threshold.
664665
constexpr uint32_t tile_size_per_bdx = (GROUP_SIZE == 1U) ? 2U : 1U;
665666

666667
// This has been hard coded to 2U. Previous implementation involved a macro redirection that

0 commit comments

Comments
 (0)