|
41 | 41 | #define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
|
42 | 42 | #define CUDART_HMASK 12000 // CUDA 12.0, min. ver. for half2 -> uint mask comparisons
|
43 | 43 |
|
44 |
| -#define GGML_CUDA_CC_PASCAL 600 |
45 |
| -#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products |
46 |
| -#define GGML_CUDA_CC_VOLTA 700 |
47 |
| -#define GGML_CUDA_CC_TURING 750 |
48 |
| -#define GGML_CUDA_CC_AMPERE 800 |
49 |
| -#define GGML_CUDA_CC_OFFSET_AMD 0x1000000 |
| 44 | +#define GGML_CUDA_CC_PASCAL 600 |
| 45 | +#define GGML_CUDA_CC_DP4A 610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products |
| 46 | +#define GGML_CUDA_CC_VOLTA 700 |
| 47 | +#define GGML_CUDA_CC_TURING 750 |
| 48 | +#define GGML_CUDA_CC_AMPERE 800 |
| 49 | +#define GGML_CUDA_CC_ADA_LOVELACE 890 |
| 50 | +#define GGML_CUDA_CC_OFFSET_AMD 0x1000000 |
50 | 51 |
|
51 | 52 | // GCN/CDNA, wave size is 64
|
52 | 53 | #define GGML_CUDA_CC_GCN4 (GGML_CUDA_CC_OFFSET_AMD + 0x803) // Tonga, Fiji, Polaris, minimum for fast fp16
|
@@ -199,6 +200,10 @@ typedef float2 dfloat2;
|
199 | 200 | #define NEW_MMA_AVAILABLE
|
200 | 201 | #endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_TURING
|
201 | 202 |
|
| 203 | +#if !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE |
| 204 | +#define CP_ASYNC_AVAILABLE |
| 205 | +#endif // !(defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) && __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE |
| 206 | + |
202 | 207 | #if !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
|
203 | 208 | #define FLASH_ATTN_AVAILABLE
|
204 | 209 | #endif // !(defined(GGML_USE_MUSA) && __MUSA_ARCH__ <= GGML_CUDA_CC_QY1)
|
@@ -231,6 +236,10 @@ static bool new_mma_available(const int cc) {
|
231 | 236 | return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_TURING;
|
232 | 237 | }
|
233 | 238 |
|
| 239 | +static bool cp_async_available(const int cc) { |
| 240 | + return cc < GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_highest_compiled_arch(cc) >= GGML_CUDA_CC_AMPERE; |
| 241 | +} |
| 242 | + |
234 | 243 | static constexpr __device__ int ggml_cuda_get_physical_warp_size() {
|
235 | 244 | #if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
236 | 245 | return __AMDGCN_WAVEFRONT_SIZE;
|
|
0 commit comments