File tree Expand file tree Collapse file tree 1 file changed +3
-2
lines changed
libflashinfer/include/flashinfer/attention/generic Expand file tree Collapse file tree 1 file changed +3
-2
lines changed Original file line number Diff line number Diff line change @@ -645,7 +645,8 @@ gpuError_t SingleDecodeWithKVCacheDispatched(Params params, typename Params::DTy
645645 const uint32_t num_kv_heads = params.num_kv_heads ;
646646 const uint32_t seq_len = params.kv_len ;
647647
648- // AMD CDNA3 optimized vector size - prefer smaller vec_size for better occupancy
648+ // Optimizing vec_size for CDNA3 architecture.
649+ // This helps keep the dynamic shared memory allocation within the hardware threshold for CDNA3.
649650 constexpr uint32_t vec_size = (HEAD_DIM < 256U )
650651 ? std::max (8UL / sizeof (DTypeKV), HEAD_DIM / 64UL )
651652 : std::max (8UL / sizeof (DTypeKV), HEAD_DIM / 32UL );
@@ -660,7 +661,7 @@ gpuError_t SingleDecodeWithKVCacheDispatched(Params params, typename Params::DTy
660661 std::max (get_heuristic_num_threads (GROUP_SIZE, sizeof (DTypeKV)), bdx * bdy);
661662 constexpr uint32_t bdz = num_threads / (bdx * bdy);
662663
663- // AMD CDNA3 Reduce tile size to minimize shared memory usage
664+ // AMD CDNA3: reduce tile size to accommodate the architecture's hardware threshold.
664665 constexpr uint32_t tile_size_per_bdx = (GROUP_SIZE == 1U ) ? 2U : 1U ;
665666
666667 // This has been hard coded to 2U. Previous implementation involved a macro redirection that
You can’t perform that action at this time.
0 commit comments