diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu index c38abd95785..1cb65e6910a 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlTwoShotAllreduceKernels.cu @@ -396,6 +396,7 @@ __inline__ __device__ T warpReduceSum(T val) return val; } +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) inline __device__ float block_reduce_sum(float val) { __shared__ float smem[WARP_SIZE]; @@ -426,6 +427,7 @@ __device__ float4 loadfloat4(void const* ptr) return return_value; } +#endif } // namespace template diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh index 18911feb7c4..06cff88ad61 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_tma_utils.cuh @@ -89,12 +89,8 @@ PFN_cuTensorMapEncodeTiled_v12000 get_cuTensorMapEncodeTiled() // Get pointer to cuTensorMapEncodeTiled cudaDriverEntryPointQueryResult driver_status; void* cuTensorMapEncodeTiled_ptr = nullptr; -#if (__CUDACC_VER_MAJOR__ >= 12 && __CUDACC_VER_MINOR__ >= 5) cudaGetDriverEntryPointByVersion( "cuTensorMapEncodeTiled", &cuTensorMapEncodeTiled_ptr, 12000, cudaEnableDefault, &driver_status); -#else - cudaGetDriverEntryPoint("cuTensorMapEncodeTiled", &cuTensorMapEncodeTiled_ptr, cudaEnableDefault, &driver_status); -#endif if (driver_status != cudaDriverEntryPointSuccess) { diff --git a/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu b/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu index d4bafb3db61..050f99efdac 100644 --- a/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu +++ b/cpp/tensorrt_llm/kernels/recoverFromRingAtten.cu @@ -53,6 +53,10 @@ __global__ void reduce4ring_attention( float* softmax_sum = softmax_stats + 1; float* max = softmax_stats; +#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__ +#pragma nv_diag_suppress static_var_with_dynamic_init +// https://nvidia.github.io/cccl/libcudacxx/extended_api/synchronization_primitives/barrier.html +#endif __shared__ cuda::barrier barrier; if (block.thread_rank() == 0) { @@ -113,11 +117,6 @@ template void invokeRecoverFromRA(Tout* accu_output, float* accu_softmax_stats, Tout* output, float* softmax_stats, int b, int s, int h, int d, int* cu_seqlens, cudaStream_t stream) { - float* accu_softmax_sum = accu_softmax_stats; - float* accu_softmax_max = accu_softmax_stats + b * s * h; - float* softmax_sum = softmax_stats; - float* softmax_max = softmax_stats + b * s * h; - int threads_per_block = 128; int saturated_s_block_dim = 3000 / b + 1; s = s * h; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp index ab52e181c45..726a2aea7ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/gemm/KernelRunner.cpp @@ -16,13 +16,16 @@ #include +// clang-format off +#include "trtllmGen_gemm_export/GemmInterface.h" +#include "trtllmGen_gemm_export/GemmOptions.h" +#include "trtllmGen_gemm_export/trtllm/gen/DtypeDecl.h" +// clang-format on + #include "KernelRunner.h" #include "tensorrt_llm/common/assert.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/envUtils.h" -#include "trtllmGen_gemm_export/GemmInterface.h" -#include "trtllmGen_gemm_export/GemmOptions.h" -#include "trtllmGen_gemm_export/trtllm/gen/DtypeDecl.h" namespace tensorrt_llm {