Skip to content

Commit

Permalink
[CUDA] Implement urKernelSuggestMaxCooperativeGroupCountExp for Cuda
Browse files Browse the repository at this point in the history
  • Loading branch information
GeorgeWeb committed Jul 4, 2024
1 parent 4030080 commit c612317
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 10 deletions.
7 changes: 1 addition & 6 deletions source/adapters/cuda/device.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -57,12 +57,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
return ReturnValue(4318u);
}
case UR_DEVICE_INFO_MAX_COMPUTE_UNITS: {
int ComputeUnits = 0;
UR_CHECK_ERROR(cuDeviceGetAttribute(
&ComputeUnits, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
hDevice->get()));
detail::ur::assertion(ComputeUnits >= 0);
return ReturnValue(static_cast<uint32_t>(ComputeUnits));
return ReturnValue(hDevice->getNumComputeUnits());
}
case UR_DEVICE_INFO_MAX_WORK_ITEM_DIMENSIONS: {
return ReturnValue(MaxWorkItemDimensions);
Expand Down
7 changes: 7 additions & 0 deletions source/adapters/cuda/device.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@ struct ur_device_handle_t_ {
int MaxCapacityLocalMem{0};
int MaxChosenLocalMem{0};
bool MaxLocalMemSizeChosen{false};
uint32_t NumComputeUnits{0};

public:
ur_device_handle_t_(native_type cuDevice, CUcontext cuContext, CUevent evBase,
Expand All @@ -54,6 +55,10 @@ struct ur_device_handle_t_ {
sizeof(MaxWorkGroupSize), &MaxWorkGroupSize,
nullptr));

UR_CHECK_ERROR(cuDeviceGetAttribute(
reinterpret_cast<int *>(&NumComputeUnits),
CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, cuDevice));

// Set local mem max size if env var is present
static const char *LocalMemSizePtrUR =
std::getenv("UR_CUDA_MAX_LOCAL_MEM_SIZE");
Expand Down Expand Up @@ -107,6 +112,8 @@ struct ur_device_handle_t_ {
int getMaxChosenLocalMem() const noexcept { return MaxChosenLocalMem; };

bool maxLocalMemSizeChosen() { return MaxLocalMemSizeChosen; };

uint32_t getNumComputeUnits() const noexcept { return NumComputeUnits; };
};

int getAttribute(ur_device_handle_t Device, CUdevice_attribute Attribute);
24 changes: 20 additions & 4 deletions source/adapters/cuda/kernel.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
#include "memory.hpp"
#include "queue.hpp"
#include "sampler.hpp"
#include "ur_api.h"

UR_APIEXPORT ur_result_t UR_APICALL
urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
Expand Down Expand Up @@ -167,10 +168,25 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetNativeHandle(
/// Suggest the maximum number of cooperative work-groups that can be launched
/// concurrently for \p hKernel on its device, given a work-group size and the
/// per-group dynamic shared-memory requirement.
///
/// @param hKernel                 Kernel to query (must be non-null).
/// @param localWorkSize           Number of work-items per work-group.
/// @param dynamicSharedMemorySize Dynamic shared memory per group, in bytes.
/// @param pGroupCountRet          [out] Total group count = (max resident
///                                groups per SM) * (number of SMs).
/// @return UR_RESULT_SUCCESS on success, or the UR error propagated from the
///         CUDA driver call.
UR_APIEXPORT ur_result_t UR_APICALL urKernelSuggestMaxCooperativeGroupCountExp(
    ur_kernel_handle_t hKernel, size_t localWorkSize,
    size_t dynamicSharedMemorySize, uint32_t *pGroupCountRet) {
  UR_ASSERT(hKernel, UR_RESULT_ERROR_INVALID_KERNEL);
  // Validate the output pointer before dereferencing it below.
  UR_ASSERT(pGroupCountRet, UR_RESULT_ERROR_INVALID_NULL_POINTER);

  // We need to set the active current device for this kernel explicitly here,
  // because the occupancy querying API does not take a device parameter.
  ur_device_handle_t Device = hKernel->getProgram()->getDevice();
  ScopedContext Active(Device);
  try {
    int MaxNumActiveGroupsPerCU{0};
    // cuOccupancyMaxActiveBlocksPerMultiprocessor takes the block size as an
    // int; cast explicitly rather than narrowing silently.
    UR_CHECK_ERROR(cuOccupancyMaxActiveBlocksPerMultiprocessor(
        &MaxNumActiveGroupsPerCU, hKernel->get(),
        static_cast<int>(localWorkSize), dynamicSharedMemorySize));
    detail::ur::assertion(MaxNumActiveGroupsPerCU >= 0);

    // Multiply by the number of SMs (CUs = compute units) on the device in
    // order to retrieve the total number of groups/blocks that can be
    // launched concurrently.
    *pGroupCountRet = Device->getNumComputeUnits() * MaxNumActiveGroupsPerCU;
  } catch (ur_result_t Err) {
    return Err;
  }
  return UR_RESULT_SUCCESS;
}

Expand Down

0 comments on commit c612317

Please sign in to comment.