@@ -396,6 +396,7 @@ __inline__ __device__ T warpReduceSum(T val)
     return val;
 }
 
+#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
 inline __device__ float block_reduce_sum(float val)
 {
     __shared__ float smem[WARP_SIZE];
@@ -426,6 +427,7 @@ __device__ float4 loadfloat4(void const* ptr)
 
     return return_value;
 }
+#endif
 } // namespace
 
 template <int DIM, int NUM_THREADS, int NUM_INPUTS, typename T_OUT, typename T_IN>
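
Note: __CUDA_ARCH__ is defined only during device compilation passes and carries the target compute capability (900 for SM90 / Hopper), so this guard compiles block_reduce_sum and loadfloat4 only when building device code for SM90 or newer. Below is a minimal sketch of the warp-shuffle block reduction such a guard typically protects; it is illustrative only, not the file's actual body, and assumes a warp size of 32.

#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
// Illustrative sketch: sum a value across a thread block using warp shuffles.
__device__ float block_reduce_sum_sketch(float val)
{
    __shared__ float smem[32];
    int const lane = threadIdx.x % 32;
    int const warp = threadIdx.x / 32;
    for (int offset = 16; offset > 0; offset /= 2)
        val += __shfl_xor_sync(0xffffffffu, val, offset); // warp-level tree sum
    if (lane == 0)
        smem[warp] = val; // one partial sum per warp
    __syncthreads();
    int const num_warps = (blockDim.x + 31) / 32;
    val = (threadIdx.x < num_warps) ? smem[threadIdx.x] : 0.0f;
    if (warp == 0)
        for (int offset = 16; offset > 0; offset /= 2)
            val += __shfl_xor_sync(0xffffffffu, val, offset); // combine partials
    return val; // result is valid in warp 0
}
#endif
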
@@ -89,12 +89,8 @@ PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled()
     // Get pointer to cuTensorMapEncodeTiled
     cudaDriverEntryPointQueryResult driver_status;
     void* cuTensorMapEncodeTiled_ptr = nullptr;
-#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 5)
     cudaGetDriverEntryPointByVersion(
         "cuTensorMapEncodeTiled", &cuTensorMapEncodeTiled_ptr, 12000, cudaEnableDefault, &driver_status);
-#else
-    cudaGetDriverEntryPoint("cuTensorMapEncodeTiled", &cuTensorMapEncodeTiled_ptr, cudaEnableDefault, &driver_status);
-#endif
 
     if (driver_status != cudaDriverEntryPointSuccess)
     {
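
Note: with the fallback removed, the lookup now always uses cudaGetDriverEntryPointByVersion, a runtime API introduced in CUDA 12.5, so building this file assumes at least that toolkit. A sketch of the retained call pattern follows; the function name and error handling are illustrative.

#include <cuda_runtime.h>

// Sketch: ask the driver for the CUDA 12.0 ABI of cuTensorMapEncodeTiled.
void* getTensorMapEncodeTiledSketch()
{
    void* fn = nullptr;
    cudaDriverEntryPointQueryResult driver_status;
    cudaError_t err = cudaGetDriverEntryPointByVersion(
        "cuTensorMapEncodeTiled", &fn, 12000, cudaEnableDefault, &driver_status);
    if (err != cudaSuccess || driver_status != cudaDriverEntryPointSuccess)
    {
        return nullptr; // symbol not available from the installed driver
    }
    return fn;
}
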
cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu (9 changes: 4 additions & 5 deletions)
@@ -53,6 +53,10 @@ __global__ void reduce4ring_attention(
     float* softmax_sum = softmax_stats + 1;
     float* max = softmax_stats;
 
+#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
+#pragma nv_diag_suppress static_var_with_dynamic_init
+// https://nvidia.github.io/cccl/libcudacxx/extended_api/synchronization_primitives/barrier.html
+#endif
     __shared__ cuda::barrier<cuda::thread_scope::thread_scope_block> barrier;
     if (block.thread_rank() == 0)
     {
@@ -113,11 +117,6 @@ template <typename Tout>
 void invokeRecoverFromRA(Tout* accu_output, float* accu_softmax_stats, Tout* output, float* softmax_stats, int b, int s,
     int h, int d, int* cu_seqlens, cudaStream_t stream)
 {
-    float* accu_softmax_sum = accu_softmax_stats;
-    float* accu_softmax_max = accu_softmax_stats + b * s * h;
-    float* softmax_sum = softmax_stats;
-    float* softmax_max = softmax_stats + b * s * h;
-
     int threads_per_block = 128;
     int saturated_s_block_dim = 3000 / b + 1;
     s = s * h;
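
Note: the suppressed diagnostic, static_var_with_dynamic_init, fires because a __shared__ cuda::barrier has a non-trivial initializer: one thread must run init() before the barrier is used. The linked libcudacxx page documents this pattern; a minimal sketch, assuming cooperative groups for the thread-rank check:

#include <cooperative_groups.h>
#include <cuda/barrier>

namespace cg = cooperative_groups;

__global__ void barrierSketch()
{
    auto block = cg::this_thread_block();
#pragma nv_diag_suppress static_var_with_dynamic_init
    __shared__ cuda::barrier<cuda::thread_scope_block> bar;
    if (block.thread_rank() == 0)
    {
        init(&bar, block.size()); // one thread runs the dynamic initializer
    }
    block.sync(); // make the initialized barrier visible to all threads

    // ... per-thread work ...

    bar.arrive_and_wait(); // block-wide synchronization point
}
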
@@ -16,13 +16,16 @@
 
 #include <vector>
 
+// clang-format off
+#include "trtllmGen_gemm_export/GemmInterface.h"
+#include "trtllmGen_gemm_export/GemmOptions.h"
+#include "trtllmGen_gemm_export/trtllm/gen/DtypeDecl.h"
+// clang-format on
+
 #include "KernelRunner.h"
 #include "tensorrt_llm/common/assert.h"
 #include "tensorrt_llm/common/cudaUtils.h"
 #include "tensorrt_llm/common/envUtils.h"
-#include "trtllmGen_gemm_export/GemmInterface.h"
-#include "trtllmGen_gemm_export/GemmOptions.h"
-#include "trtllmGen_gemm_export/trtllm/gen/DtypeDecl.h"
 
 namespace tensorrt_llm
 {
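
Note: moving the trtllmGen_gemm_export headers above the project headers and fencing them with clang-format off/on keeps the formatter's include sorting from reordering them, which suggests an include-order dependency (an inference from the diff, not stated in it). The fence pattern in general, with hypothetical header names:

// clang-format off
#include "order_sensitive_first.h"  // hypothetical: must be included first
#include "order_sensitive_second.h" // hypothetical: depends on the line above
// clang-format on

#include "regular_header.h" // hypothetical: normal include sorting resumes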