From c1460fbd88c912c4e1c1f7987ad683c7c89a5b0e Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Tue, 6 Aug 2024 13:59:29 +0100 Subject: [PATCH 01/11] Add initial Offload adapter implementation --- .../source/adapters/CMakeLists.txt | 5 + .../source/adapters/offload/.clang-format | 4 + .../source/adapters/offload/CMakeLists.txt | 58 +++ .../source/adapters/offload/adapter.cpp | 96 ++++ .../source/adapters/offload/adapter.hpp | 20 + .../source/adapters/offload/common.hpp | 7 + .../source/adapters/offload/context.cpp | 28 ++ .../source/adapters/offload/context.hpp | 19 + .../source/adapters/offload/device.cpp | 159 +++++++ .../source/adapters/offload/enqueue.cpp | 51 +++ .../source/adapters/offload/event.cpp | 36 ++ .../source/adapters/offload/event.hpp | 10 + .../source/adapters/offload/kernel.cpp | 70 +++ .../source/adapters/offload/kernel.hpp | 52 +++ .../source/adapters/offload/platform.cpp | 108 +++++ .../source/adapters/offload/program.cpp | 101 +++++ .../source/adapters/offload/program.hpp | 10 + .../source/adapters/offload/queue.cpp | 49 +++ .../source/adapters/offload/queue.hpp | 11 + .../source/adapters/offload/ur2offload.hpp | 29 ++ .../adapters/offload/ur_interface_loader.cpp | 412 ++++++++++++++++++ .../source/adapters/offload/usm.cpp | 53 +++ 22 files changed, 1388 insertions(+) create mode 100644 unified-runtime/source/adapters/offload/.clang-format create mode 100644 unified-runtime/source/adapters/offload/CMakeLists.txt create mode 100644 unified-runtime/source/adapters/offload/adapter.cpp create mode 100644 unified-runtime/source/adapters/offload/adapter.hpp create mode 100644 unified-runtime/source/adapters/offload/common.hpp create mode 100644 unified-runtime/source/adapters/offload/context.cpp create mode 100644 unified-runtime/source/adapters/offload/context.hpp create mode 100644 unified-runtime/source/adapters/offload/device.cpp create mode 100644 unified-runtime/source/adapters/offload/enqueue.cpp create mode 100644 
unified-runtime/source/adapters/offload/event.cpp create mode 100644 unified-runtime/source/adapters/offload/event.hpp create mode 100644 unified-runtime/source/adapters/offload/kernel.cpp create mode 100644 unified-runtime/source/adapters/offload/kernel.hpp create mode 100644 unified-runtime/source/adapters/offload/platform.cpp create mode 100644 unified-runtime/source/adapters/offload/program.cpp create mode 100644 unified-runtime/source/adapters/offload/program.hpp create mode 100644 unified-runtime/source/adapters/offload/queue.cpp create mode 100644 unified-runtime/source/adapters/offload/queue.hpp create mode 100644 unified-runtime/source/adapters/offload/ur2offload.hpp create mode 100644 unified-runtime/source/adapters/offload/ur_interface_loader.cpp create mode 100644 unified-runtime/source/adapters/offload/usm.cpp diff --git a/unified-runtime/source/adapters/CMakeLists.txt b/unified-runtime/source/adapters/CMakeLists.txt index 56e053d29bc3e..34ba19b6e8859 100644 --- a/unified-runtime/source/adapters/CMakeLists.txt +++ b/unified-runtime/source/adapters/CMakeLists.txt @@ -68,9 +68,14 @@ if(UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL) add_ur_adapter_subdirectory(opencl) list(APPEND TEMP_LIST "opencl") endif() + if(UR_BUILD_ADAPTER_NATIVE_CPU OR UR_BUILD_ADAPTER_ALL) add_ur_adapter_subdirectory(native_cpu) list(APPEND TEMP_LIST "native_cpu") endif() +if(UR_BUILD_ADAPTER_OFFLOAD) + add_ur_adapter_subdirectory(offload) +endif() + set(UR_ADAPTERS_LIST "${TEMP_LIST}" CACHE STRING "" FORCE) diff --git a/unified-runtime/source/adapters/offload/.clang-format b/unified-runtime/source/adapters/offload/.clang-format new file mode 100644 index 0000000000000..c8daebc205b34 --- /dev/null +++ b/unified-runtime/source/adapters/offload/.clang-format @@ -0,0 +1,4 @@ +--- +Language: Cpp +BasedOnStyle: LLVM +... 
diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt new file mode 100644 index 0000000000000..d3559c2ae7761 --- /dev/null +++ b/unified-runtime/source/adapters/offload/CMakeLists.txt @@ -0,0 +1,58 @@ +set(UR_OFFLOAD_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH "Offload adapter directory") # TODO + +set(TARGET_NAME ur_adapter_offload) + +set(UR_OFFLOAD_INSTALL_DIR "" CACHE PATH "Path to the directory containing libomptarget.so etc") +if (UR_OFFLOAD_INSTALL_DIR STREQUAL "") + message(FATAL_ERROR "UR_OFFLOAD_INSTALL_DIR must be defined for the Offload adapter") +endif() + +set(UR_OFFLOAD_INCLUDE_DIR "" CACHE PATH "Path to the directory containing LLVM headers") +if (UR_OFFLOAD_INCLUDE_DIR STREQUAL "") + message(FATAL_ERROR "UR_OFFLOAD_INCLUDE_DIR must be defined for the Offload adapter") +endif() + +# For the PTX workaround we need to link with CUDA. +if (NOT TARGET cudadrv) + find_package(CUDA 10.1 REQUIRED) + add_library(cudadrv SHARED IMPORTED GLOBAL) + set_target_properties( + cudadrv PROPERTIES + IMPORTED_LOCATION ${CUDA_CUDA_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} + ) +endif() + +add_ur_adapter(${TARGET_NAME} + SHARED + ${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/context.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/ur2offload.hpp + ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp +) + +set_target_properties(${TARGET_NAME} PROPERTIES + VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}" + SOVERSION "${PROJECT_VERSION_MAJOR}" +) + +target_link_libraries(${TARGET_NAME} PRIVATE + 
${PROJECT_NAME}::headers + ${PROJECT_NAME}::common + ${PROJECT_NAME}::umf + ${UR_OFFLOAD_INSTALL_DIR}/lib/libLLVMOffload.so + cudadrv +) + +target_include_directories(${TARGET_NAME} PRIVATE + "${UR_OFFLOAD_INCLUDE_DIR}/offload" + "${CMAKE_CURRENT_SOURCE_DIR}/../../" +) diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp new file mode 100644 index 0000000000000..bde5dc8e8e6a5 --- /dev/null +++ b/unified-runtime/source/adapters/offload/adapter.cpp @@ -0,0 +1,96 @@ +//===----------- adapter.cpp - LLVM Offload Plugin -----------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "adapter.hpp" +#include "ur/ur.hpp" +#include "ur_api.h" + +ur_adapter_handle_t_ Adapter{}; + +// Initialize liboffload and perform the initial platform and device discovery +ur_result_t ur_adapter_handle_t_::init() { + auto Res = olInit(); + + // Discover every platform that isn't the host platform. + // Use an unordered_set to deduplicate platforms we discover multiple times + // from different devices. + // Also discover the host device. We only expect one so don't need to worry + // about overwriting it. 
+ Res = olIterateDevices( + [](ol_device_handle_t D, void *UserData) { + auto Adapter = static_cast(UserData); + ol_platform_handle_t Platform; + olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), + &Platform); + ol_platform_backend_t Backend; + olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend), + &Backend); + if (Backend == OL_PLATFORM_BACKEND_HOST) { + Adapter->HostDevice = D; + } else if (Backend != OL_PLATFORM_BACKEND_UNKNOWN) { + Adapter->Platforms.insert(Platform); + } + return false; + }, + this); + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet( + uint32_t, ur_adapter_handle_t *phAdapters, uint32_t *pNumAdapters) { + if (phAdapters) { + if (++Adapter.RefCount == 1) { + Adapter.init(); + } + *phAdapters = &Adapter; + } + if (pNumAdapters) { + *pNumAdapters = 1; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urAdapterRelease(ur_adapter_handle_t) { + if (--Adapter.RefCount == 0) { + // This can crash when tracing is enabled. 
+ // olShutDown(); + }; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urAdapterRetain(ur_adapter_handle_t) { + Adapter.RefCount++; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, + ur_adapter_info_t propName, + size_t propSize, + void *pPropValue, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_ADAPTER_INFO_BACKEND: + return ReturnValue(UR_ADAPTER_BACKEND_CUDA); // TODO: Return a proper value + case UR_ADAPTER_INFO_REFERENCE_COUNT: + return ReturnValue(Adapter.RefCount.load()); + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/offload/adapter.hpp b/unified-runtime/source/adapters/offload/adapter.hpp new file mode 100644 index 0000000000000..be36aceaa8410 --- /dev/null +++ b/unified-runtime/source/adapters/offload/adapter.hpp @@ -0,0 +1,20 @@ +#pragma once + +#include +#include +#include + +#include + +#include "logger/ur_logger.hpp" + +struct ur_adapter_handle_t_ { + std::atomic_uint32_t RefCount = 0; + logger::Logger &Logger = logger::get_logger("offload"); + ol_device_handle_t HostDevice = nullptr; + std::unordered_set Platforms; + + ur_result_t init(); +}; + +extern ur_adapter_handle_t_ Adapter; diff --git a/unified-runtime/source/adapters/offload/common.hpp b/unified-runtime/source/adapters/offload/common.hpp new file mode 100644 index 0000000000000..69aa6bff11e9f --- /dev/null +++ b/unified-runtime/source/adapters/offload/common.hpp @@ -0,0 +1,7 @@ +#pragma once + +#include + +struct RefCounted { + std::atomic_uint32_t RefCount = 1; +}; diff --git a/unified-runtime/source/adapters/offload/context.cpp b/unified-runtime/source/adapters/offload/context.cpp new file mode 100644 index 0000000000000..01d015038c3b1 --- /dev/null +++ b/unified-runtime/source/adapters/offload/context.cpp @@ -0,0 +1,28 @@ +#include 
"context.hpp" +#include + +UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( + uint32_t DeviceCount, const ur_device_handle_t *phDevices, + const ur_context_properties_t *, ur_context_handle_t *phContext) { + if (DeviceCount > 1) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + auto Ctx = new ur_context_handle_t_(*phDevices); + *phContext = Ctx; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urContextRetain(ur_context_handle_t hContext) { + hContext->RefCount++; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urContextRelease(ur_context_handle_t hContext) { + if (--hContext->RefCount == 0) { + delete hContext; + } + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/offload/context.hpp b/unified-runtime/source/adapters/offload/context.hpp new file mode 100644 index 0000000000000..9483ec1b4a8b8 --- /dev/null +++ b/unified-runtime/source/adapters/offload/context.hpp @@ -0,0 +1,19 @@ +#pragma once + +#include +#include +#include +#include + +struct ur_context_handle_t_ { + ur_context_handle_t_(ur_device_handle_t hDevice) : Device{hDevice} { + urDeviceRetain(Device); + } + ~ur_context_handle_t_() { + urDeviceRelease(Device); + } + + ur_device_handle_t Device; + std::atomic_uint32_t RefCount; + std::unordered_map AllocTypeMap; +}; diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp new file mode 100644 index 0000000000000..54cc60c6fe110 --- /dev/null +++ b/unified-runtime/source/adapters/offload/device.cpp @@ -0,0 +1,159 @@ +#include +#include +#include + +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, + ur_device_type_t, + uint32_t NumEntries, + ur_device_handle_t *phDevices, + uint32_t *pNumDevices) { + + uint32_t NumDevices = 0; + // Pass a few things to the callback (we can't use a lambda with captures) + using ParamsT = struct { + uint32_t DeviceLimit; + 
uint32_t &NumDevices; + ol_platform_handle_t Platform; + ol_device_handle_t *DevicesOut; + }; + ParamsT Params = {NumEntries, NumDevices, + reinterpret_cast(hPlatform), + reinterpret_cast(phDevices)}; + + olIterateDevices( + [](ol_device_handle_t D, void *Data) { + auto Params = reinterpret_cast(Data); + ol_platform_handle_t Platform = nullptr; + olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), + &Platform); + if (Platform == Params->Platform) { + if (Params->DevicesOut) { + Params->DevicesOut[Params->NumDevices] = D; + } + Params->NumDevices++; + } + return Params->NumDevices == Params->DeviceLimit; + }, + &Params); + + if (pNumDevices) { + *pNumDevices = NumDevices; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, + ur_device_info_t propName, + size_t propSize, + void *pPropValue, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + ol_device_info_t olInfo; + switch (propName) { + case UR_DEVICE_INFO_NAME: + olInfo = OL_DEVICE_INFO_NAME; + break; + case UR_DEVICE_INFO_PARENT_DEVICE: + return ReturnValue(nullptr); + case UR_DEVICE_INFO_VERSION: + return ReturnValue(""); + case UR_DEVICE_INFO_EXTENSIONS: + return ReturnValue(""); + case UR_DEVICE_INFO_USE_NATIVE_ASSERT: + return ReturnValue(false); + case UR_DEVICE_INFO_TYPE: + olInfo = OL_DEVICE_INFO_TYPE; + break; + case UR_DEVICE_INFO_VENDOR: + olInfo = OL_DEVICE_INFO_VENDOR; + break; + case UR_DEVICE_INFO_DRIVER_VERSION: + olInfo = OL_DEVICE_INFO_DRIVER_VERSION; + break; + case UR_DEVICE_INFO_PLATFORM: + olInfo = OL_DEVICE_INFO_PLATFORM; + break; + case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: + return ReturnValue(UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS); + case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE: + return ReturnValue(false); + default: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + } + + if (pPropSizeRet) { + if (auto Res = + olGetDeviceInfoSize(reinterpret_cast(hDevice), + 
olInfo, pPropSizeRet)) { + return offloadResultToUR(Res); + } + } + + if (pPropValue) { + if (auto Res = + olGetDeviceInfo(reinterpret_cast(hDevice), + olInfo, propSize, pPropValue)) { + return offloadResultToUR(Res); + } + // Need to explicitly map this type + if (olInfo == OL_DEVICE_INFO_TYPE) { + auto urPropPtr = reinterpret_cast(pPropValue); + auto olPropPtr = reinterpret_cast(pPropValue); + + switch (*olPropPtr) { + case OL_DEVICE_TYPE_CPU: + *urPropPtr = UR_DEVICE_TYPE_CPU; + break; + case OL_DEVICE_TYPE_GPU: + *urPropPtr = UR_DEVICE_TYPE_GPU; + break; + default: + break; + } + } + } + + return UR_RESULT_SUCCESS; +} + +// Device partitioning is not supported in Offload, and won't be for some time. +// This means urDeviceRetain/Release are no-ops because all devices are root +// devices. + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t) { + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t) { + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urDevicePartition(ur_device_handle_t, const ur_device_partition_properties_t *, + uint32_t, ur_device_handle_t *, uint32_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( + ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries, + uint32_t NumBinaries, uint32_t *pSelectedBinary) { + std::ignore = hDevice; + std::ignore = pBinaries; + std::ignore = NumBinaries; + std::ignore = pSelectedBinary; + + // TODO: Don't hard code nvptx64! 
+ const char *image_target = UR_DEVICE_BINARY_TARGET_NVPTX64; + for (uint32_t i = 0; i < NumBinaries; ++i) { + if (strcmp(pBinaries[i].pDeviceTargetSpec, image_target) == 0) { + *pSelectedBinary = i; + return UR_RESULT_SUCCESS; + } + } + + // No image can be loaded for the given device + return UR_RESULT_ERROR_INVALID_BINARY; +} diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp new file mode 100644 index 0000000000000..30f5a099429fa --- /dev/null +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -0,0 +1,51 @@ +#include +#include +#include + +#include "event.hpp" +#include "kernel.hpp" +#include "queue.hpp" +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( + ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim, + const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize, + const size_t *pLocalWorkSize, uint32_t numEventsInWaitList, + const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) { + // Ignore wait list for now + (void)numEventsInWaitList; + (void)phEventWaitList; + // + + (void) pGlobalWorkOffset; + (void) pLocalWorkSize; + + assert(workDim == 1); + + ol_kernel_launch_size_args_t LaunchArgs; + LaunchArgs.Dimensions = workDim; + LaunchArgs.NumGroupsX = pGlobalWorkSize[0]; + LaunchArgs.NumGroupsY = 1; + LaunchArgs.NumGroupsZ = 1; + LaunchArgs.GroupSizeX = 1; + LaunchArgs.GroupSizeY = 1; + LaunchArgs.GroupSizeZ = 1; + LaunchArgs.DynSharedMemory = 0; + + ol_event_handle_t EventOut; + auto Ret = + olLaunchKernel(hQueue->OffloadQueue, hQueue->OffloadDevice, + hKernel->OffloadKernel, hKernel->Args.getStorage(), + hKernel->Args.getStorageSize(), &LaunchArgs, &EventOut); + + if (Ret != OL_SUCCESS) { + return offloadResultToUR(Ret); + } + + if (phEvent) { + auto *Event = new ur_event_handle_t_(); + Event->OffloadEvent = EventOut; + *phEvent = Event; + } + return UR_RESULT_SUCCESS; +} diff --git 
a/unified-runtime/source/adapters/offload/event.cpp b/unified-runtime/source/adapters/offload/event.cpp
new file mode 100644
index 0000000000000..5dec5fa29d113
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/event.cpp
@@ -0,0 +1,42 @@
+#include
+#include
+
+#include "event.hpp"
+#include "ur2offload.hpp"
+
+// Waits on each event in phEventWaitList in order via olWaitEvent. Stops at
+// the first liboffload failure and returns it translated to a UR result.
+UR_APIEXPORT ur_result_t UR_APICALL
+urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) {
+  for (uint32_t i = 0; i < numEvents; i++) {
+    auto Res = olWaitEvent(phEventWaitList[i]->OffloadEvent);
+    if (Res) {
+      return offloadResultToUR(Res);
+    }
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+// Adds one reference to the event wrapper.
+UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) {
+  hEvent->RefCount++;
+
+  return UR_RESULT_SUCCESS;
+}
+
+// Drops one reference and frees the wrapper once the count reaches zero.
+// The delete must stay inside the zero check: freeing on every release would
+// leave handles that were retained elsewhere dangling.
+UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) {
+  if (--hEvent->RefCount == 0) {
+    // There's a small bug in olDestroyEvent that will crash. Leak the event
+    // in the meantime.
+    // auto Res = olDestroyEvent(hEvent->OffloadEvent);
+    // if (Res) {
+    //   return offloadResultToUR(Res);
+    // }
+    delete hEvent;
+  }
+
+  return UR_RESULT_SUCCESS;
+}
diff --git a/unified-runtime/source/adapters/offload/event.hpp b/unified-runtime/source/adapters/offload/event.hpp
new file mode 100644
index 0000000000000..95f692214e6f1
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/event.hpp
@@ -0,0 +1,12 @@
+#pragma once
+
+#include
+#include
+
+#include "common.hpp"
+
+// UR event object: a reference-counted wrapper (see RefCounted) around a
+// liboffload event handle.
+struct ur_event_handle_t_ : RefCounted {
+  ol_event_handle_t OffloadEvent;
+};
diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp
new file mode 100644
index 0000000000000..6ab95aa6640da
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/kernel.cpp
@@ -0,0 +1,82 @@
+#include "kernel.hpp"
+#include "program.hpp"
+#include "ur2offload.hpp"
+#include
+#include
+#include
+
+// Looks up pKernelName in hProgram's liboffload program with olGetKernel and
+// hands back a new UR kernel wrapper. If the lookup fails the wrapper is
+// freed and the liboffload error is returned as a UR result.
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
+               ur_kernel_handle_t *phKernel) {
+  ur_kernel_handle_t Kernel = new ur_kernel_handle_t_;
+
+  auto Res = olGetKernel(hProgram->OffloadProgram, pKernelName,
+                         &Kernel->OffloadKernel);
+
+  if (Res != OL_SUCCESS) {
+    delete Kernel;
+    return offloadResultToUR(Res);
+  }
+
+  *phKernel = Kernel;
+
+  return UR_RESULT_SUCCESS;
+}
+
+// Adds one reference to the kernel wrapper.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) {
+  hKernel->RefCount++;
+  return UR_RESULT_SUCCESS;
+}
+
+// Drops one reference; the wrapper is freed when the count reaches zero.
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelRelease(ur_kernel_handle_t hKernel) {
+  if (--hKernel->RefCount == 0) {
+    delete hKernel;
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+// Exec info is currently ignored: every request is accepted as a no-op.
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelSetExecInfo(ur_kernel_handle_t, ur_kernel_exec_info_t, size_t,
+                    const ur_kernel_exec_info_properties_t *, const void *) {
+  return UR_RESULT_SUCCESS;
+}
+
+// Records a pointer argument. The pointer value itself (sizeof(pArgValue)
+// bytes read from &pArgValue) is what gets stored at argIndex.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer(
+    ur_kernel_handle_t hKernel, uint32_t argIndex,
+    const ur_kernel_arg_pointer_properties_t *, const void *pArgValue) {
+  hKernel->Args.addArg(argIndex, sizeof(pArgValue), &pArgValue);
+  return UR_RESULT_SUCCESS;
+}
+
+// Records a by-value argument: argSize bytes from pArgValue at argIndex.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
+    ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize,
+    const ur_kernel_arg_value_properties_t *, const void *pArgValue) {
+  hKernel->Args.addArg(argIndex, argSize, pArgValue);
+  return UR_RESULT_SUCCESS;
+}
+
+// Only UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE is answered, always as
+// {0, 0, 0}; every other query returns
+// UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION.
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelGetGroupInfo(ur_kernel_handle_t, ur_device_handle_t,
+                     ur_kernel_group_info_t propName, size_t propSize,
+                     void *pPropValue, size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+
+  if (propName == UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE) {
+    size_t GroupSize[3] = {0, 0, 0};
+    return ReturnValue(GroupSize, 3);
+  }
+  return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+}
diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp
new file
mode 100644 index 0000000000000..dee293aaa1b44 --- /dev/null +++ b/unified-runtime/source/adapters/offload/kernel.hpp @@ -0,0 +1,52 @@ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +struct ur_kernel_handle_t_ : RefCounted { + + // Simplified version of the CUDA adapter's argument implementation + struct OffloadKernelArguments { + static constexpr size_t MaxParamBytes = 4096u; + using args_t = std::array; + using args_size_t = std::vector; + using args_ptr_t = std::vector; + args_t Storage; + size_t StorageUsed = 0; + args_size_t ParamSizes; + args_ptr_t Pointers; + + // Add an argument. If it already exists, it is replaced. Gaps are filled + // with empty arguments. + void addArg(size_t Index, size_t Size, const void *Arg) { + if (Index + 1 > Pointers.size()) { + Pointers.resize(Index + 1); + ParamSizes.resize(Index + 1); + } + ParamSizes[Index] = Size; + // Calculate the insertion point in the array. + size_t InsertPos = std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + Index, 0); + // Update the stored value for the argument. 
+ std::memcpy(&Storage[InsertPos], Arg, Size); + Pointers[Index] = &Storage[InsertPos]; + } + + const args_ptr_t &getPointers() const noexcept { return Pointers; } + + const char *getStorage() const noexcept { return Storage.data(); } + + size_t getStorageSize() const noexcept { + return std::accumulate(std::begin(ParamSizes), std::end(ParamSizes), 0); + } + }; + + ol_kernel_handle_t OffloadKernel; + OffloadKernelArguments Args{}; +}; diff --git a/unified-runtime/source/adapters/offload/platform.cpp b/unified-runtime/source/adapters/offload/platform.cpp new file mode 100644 index 0000000000000..50b9ac90649ce --- /dev/null +++ b/unified-runtime/source/adapters/offload/platform.cpp @@ -0,0 +1,108 @@ +#include +#include +#include +#include + +#include "adapter.hpp" +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urPlatformGet(ur_adapter_handle_t, uint32_t NumEntries, + ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { + + if (pNumPlatforms) { + *pNumPlatforms = Adapter.Platforms.size(); + } + + if (phPlatforms) { + size_t PlatformIndex = 0; + for (auto &Platform : Adapter.Platforms) { + phPlatforms[PlatformIndex++] = + reinterpret_cast(Platform); + if (PlatformIndex == NumEntries) { + break; + } + } + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + ol_platform_info_t olInfo; + switch (propName) { + case UR_PLATFORM_INFO_NAME: + olInfo = OL_PLATFORM_INFO_NAME; + break; + case UR_PLATFORM_INFO_VENDOR_NAME: + olInfo = OL_PLATFORM_INFO_VENDOR_NAME; + break; + case UR_PLATFORM_INFO_VERSION: + olInfo = OL_PLATFORM_INFO_VERSION; + break; + case UR_PLATFORM_INFO_EXTENSIONS: + return ReturnValue(""); + case UR_PLATFORM_INFO_PROFILE: + return ReturnValue("FULL_PROFILE"); + case UR_PLATFORM_INFO_BACKEND: + olInfo = 
OL_PLATFORM_INFO_BACKEND; + break; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (pPropSizeRet) { + if (auto Res = olGetPlatformInfoSize( + reinterpret_cast(hPlatform), olInfo, + pPropSizeRet)) { + return offloadResultToUR(Res); + } + } + + if (pPropValue) { + if (auto Res = + olGetPlatformInfo(reinterpret_cast(hPlatform), + olInfo, propSize, pPropValue)) { + return offloadResultToUR(Res); + } + + // Need to explicitly map this type + if (olInfo == OL_PLATFORM_INFO_BACKEND) { + auto urPropPtr = reinterpret_cast(pPropValue); + auto olPropPtr = reinterpret_cast(pPropValue); + + switch (*olPropPtr) { + case OL_PLATFORM_BACKEND_CUDA: + *urPropPtr = UR_PLATFORM_BACKEND_CUDA; + break; + case OL_PLATFORM_BACKEND_AMDGPU: + *urPropPtr = UR_PLATFORM_BACKEND_HIP; + break; + default: + *urPropPtr = UR_PLATFORM_BACKEND_UNKNOWN; + break; + } + } + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urPlatformGetBackendOption(ur_platform_handle_t, const char *pFrontendOption, + const char **ppPlatformOption) { + using namespace std::literals; + if (pFrontendOption == nullptr) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || + pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || + pFrontendOption == ""sv) { + *ppPlatformOption = ""; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp new file mode 100644 index 0000000000000..a55644efdd4f7 --- /dev/null +++ b/unified-runtime/source/adapters/offload/program.cpp @@ -0,0 +1,101 @@ +#include +#include +#include +#include + +#include "context.hpp" +#include "program.hpp" +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( + ur_context_handle_t hContext, uint32_t numDevices, + ur_device_handle_t *phDevices, size_t *pLengths, const uint8_t 
**ppBinaries, + const ur_program_properties_t *, ur_program_handle_t *phProgram) { + if (numDevices > 1) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + // Workaround for Offload not supporting PTX binaries. Force CUDA programs + // to be linked so they end up as CUBIN. + uint8_t *RealBinary; + size_t RealLength; + ur_platform_handle_t DevicePlatform; + bool DidLink = false; + CUlinkState State; + urDeviceGetInfo(phDevices[0], UR_DEVICE_INFO_PLATFORM, + sizeof(ur_platform_handle_t), &DevicePlatform, nullptr); + ur_platform_backend_t PlatformBackend; + urPlatformGetInfo(DevicePlatform, UR_PLATFORM_INFO_BACKEND, + sizeof(ur_platform_backend_t), &PlatformBackend, nullptr); + if (PlatformBackend == UR_PLATFORM_BACKEND_CUDA) { + cuLinkCreate(0, nullptr, nullptr, &State); + + cuLinkAddData(State, CU_JIT_INPUT_PTX, (char *)(ppBinaries[0]), pLengths[0], + nullptr, 0, nullptr, nullptr); + + void *CuBin = nullptr; + size_t CuBinSize = 0; + cuLinkComplete(State, &CuBin, &CuBinSize); + RealBinary = (uint8_t *)CuBin; + RealLength = CuBinSize; + DidLink = true; + fprintf(stderr, "Performed CUDA bin workaround (size = %lu)\n", RealLength); + } else { + RealBinary = const_cast(ppBinaries[0]); + RealLength = pLengths[0]; + } + + ur_program_handle_t Program = new ur_program_handle_t_(); + auto Res = + olCreateProgram(reinterpret_cast(hContext->Device), + RealBinary, RealLength, &Program->OffloadProgram); + + // Program owns the linked module now + if (DidLink) { + cuLinkDestroy(State); + } + + if (Res != OL_SUCCESS) { + delete Program; + return offloadResultToUR(Res); + } + + *phProgram = Program; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t, + ur_program_handle_t, + const char *) { + // Do nothing, program is built upon creation + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(ur_program_handle_t, + uint32_t, + ur_device_handle_t *, + const char *) { + // Do nothing, program 
is built upon creation + return UR_RESULT_SUCCESS; +} + + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRetain(ur_program_handle_t hProgram) { + hProgram->RefCount++; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRelease(ur_program_handle_t hProgram) { + if (--hProgram->RefCount == 0) { + auto Res = olDestroyProgram(hProgram->OffloadProgram); + if (Res) { + return offloadResultToUR(Res); + } + delete hProgram; + } + + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/offload/program.hpp b/unified-runtime/source/adapters/offload/program.hpp new file mode 100644 index 0000000000000..0639ab336c5fb --- /dev/null +++ b/unified-runtime/source/adapters/offload/program.hpp @@ -0,0 +1,10 @@ +#pragma once + +#include +#include + +#include "common.hpp" + +struct ur_program_handle_t_ : RefCounted { + ol_program_handle_t OffloadProgram; +}; diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp new file mode 100644 index 0000000000000..2900deab68c60 --- /dev/null +++ b/unified-runtime/source/adapters/offload/queue.cpp @@ -0,0 +1,49 @@ +#include +#include +#include + +#include "context.hpp" +#include "queue.hpp" +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( + [[maybe_unused]] ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_queue_properties_t *, ur_queue_handle_t *phQueue) { + + assert(hContext->Device == hDevice); + + ur_queue_handle_t Queue = new ur_queue_handle_t_(); + auto Res = olCreateQueue(reinterpret_cast(hDevice), + &Queue->OffloadQueue); + if (Res != OL_SUCCESS) { + delete Queue; + return offloadResultToUR(Res); + } + + Queue->OffloadDevice = reinterpret_cast(hDevice); + + *phQueue = Queue; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { + hQueue->RefCount++; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL 
urQueueRelease(ur_queue_handle_t hQueue) { + if (--hQueue->RefCount == 0) { + auto Res = olDestroyQueue(hQueue->OffloadQueue); + if (Res) { + return offloadResultToUR(Res); + } + delete hQueue; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { + return offloadResultToUR(olWaitQueue(hQueue->OffloadQueue)); +} diff --git a/unified-runtime/source/adapters/offload/queue.hpp b/unified-runtime/source/adapters/offload/queue.hpp new file mode 100644 index 0000000000000..9406d460b7401 --- /dev/null +++ b/unified-runtime/source/adapters/offload/queue.hpp @@ -0,0 +1,11 @@ +#pragma once + +#include +#include + +#include "common.hpp" + +struct ur_queue_handle_t_ : RefCounted { + ol_queue_handle_t OffloadQueue; + ol_device_handle_t OffloadDevice; +}; diff --git a/unified-runtime/source/adapters/offload/ur2offload.hpp b/unified-runtime/source/adapters/offload/ur2offload.hpp new file mode 100644 index 0000000000000..2e9835bc480d0 --- /dev/null +++ b/unified-runtime/source/adapters/offload/ur2offload.hpp @@ -0,0 +1,29 @@ +//===--------- ur2offload.hpp - LLVM Offload Adapter ----------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +inline ur_result_t offloadResultToUR(ol_result_t Result) { + if (Result == OL_SUCCESS) { + return UR_RESULT_SUCCESS; + } + + switch (Result->Code) { + case OL_ERRC_INVALID_NULL_HANDLE: + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + case OL_ERRC_INVALID_NULL_POINTER: + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + case OL_ERRC_UNSUPPORTED_ENUMERATION: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + default: + return UR_RESULT_ERROR_UNKNOWN; + } +} diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp new file mode 100644 index 0000000000000..789bd653ea4bb --- /dev/null +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -0,0 +1,412 @@ +//===----------- ur_interface_loader.cpp - LLVM Offload Plugin -----------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +namespace { + +// TODO - this is a duplicate of what is in the L0 plugin +// We should move this to somewhere common +ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) { + if (pDdiTable == nullptr) { + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + } + // Pre 1.0 we enforce that loader and adapter must have the same version. + // Post 1.0 only a major version match should be required. 
+ if (version != UR_API_VERSION_CURRENT) { + return UR_RESULT_ERROR_UNSUPPORTED_VERSION; + } + return UR_RESULT_SUCCESS; +} +} // namespace + +extern "C" { + +UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable( + ur_api_version_t version, ur_platform_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGet = urPlatformGet; + pDdiTable->pfnGetApiVersion = nullptr; + pDdiTable->pfnGetInfo = urPlatformGetInfo; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable( + ur_api_version_t version, ur_context_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urContextCreate; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = urContextRelease; + pDdiTable->pfnRetain = urContextRetain; + pDdiTable->pfnSetExtendedDeleter = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable( + ur_api_version_t version, ur_event_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetProfilingInfo = nullptr; + pDdiTable->pfnRelease = urEventRelease; + pDdiTable->pfnRetain = urEventRetain; + pDdiTable->pfnSetCallback = nullptr; + pDdiTable->pfnWait = urEventWait; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable( + ur_api_version_t version, 
ur_program_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBuild = urProgramBuild; + pDdiTable->pfnCompile = nullptr; + pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary; + pDdiTable->pfnCreateWithIL = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetBuildInfo = nullptr; + pDdiTable->pfnGetFunctionPointer = nullptr; + pDdiTable->pfnGetGlobalVariablePointer = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnLink = nullptr; + pDdiTable->pfnRelease = urProgramRelease; + pDdiTable->pfnRetain = urProgramRetain; + pDdiTable->pfnSetSpecializationConstants = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable( + ur_api_version_t version, ur_kernel_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urKernelCreate; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnGetSubGroupInfo = nullptr; + pDdiTable->pfnRelease = urKernelRelease; + pDdiTable->pfnRetain = urKernelRetain; + pDdiTable->pfnSetArgLocal = nullptr; + pDdiTable->pfnSetArgMemObj = nullptr; + pDdiTable->pfnSetArgPointer = urKernelSetArgPointer; + pDdiTable->pfnSetArgSampler = nullptr; + pDdiTable->pfnSetArgValue = urKernelSetArgValue; + pDdiTable->pfnSetExecInfo = urKernelSetExecInfo; + pDdiTable->pfnSetSpecializationConstants = nullptr; + pDdiTable->pfnGetSuggestedLocalWorkSize = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable( + ur_api_version_t version, ur_sampler_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if 
(UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnBufferCreate = nullptr; + pDdiTable->pfnBufferPartition = nullptr; + pDdiTable->pfnBufferCreateWithNativeHandle = nullptr; + pDdiTable->pfnImageCreateWithNativeHandle = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnImageCreate = nullptr; + pDdiTable->pfnImageGetInfo = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable( + ur_api_version_t version, ur_enqueue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnDeviceGlobalVariableRead = nullptr; + pDdiTable->pfnDeviceGlobalVariableWrite = nullptr; + pDdiTable->pfnEventsWait = nullptr; + pDdiTable->pfnEventsWaitWithBarrier = nullptr; + pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch; + pDdiTable->pfnMemBufferCopy = nullptr; + pDdiTable->pfnMemBufferCopyRect = nullptr; + pDdiTable->pfnMemBufferFill = nullptr; + pDdiTable->pfnMemBufferMap = nullptr; + pDdiTable->pfnMemBufferRead = nullptr; + pDdiTable->pfnMemBufferReadRect = nullptr; + pDdiTable->pfnMemBufferWrite = nullptr; + pDdiTable->pfnMemBufferWriteRect = nullptr; + pDdiTable->pfnMemImageCopy = nullptr; + pDdiTable->pfnMemImageRead = nullptr; + pDdiTable->pfnMemImageWrite = nullptr; + pDdiTable->pfnMemUnmap = nullptr; + 
pDdiTable->pfnUSMFill2D = nullptr; + pDdiTable->pfnUSMFill = nullptr; + pDdiTable->pfnUSMAdvise = nullptr; + pDdiTable->pfnUSMMemcpy2D = nullptr; + pDdiTable->pfnUSMMemcpy = nullptr; + pDdiTable->pfnUSMPrefetch = nullptr; + pDdiTable->pfnReadHostPipe = nullptr; + pDdiTable->pfnWriteHostPipe = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable( + ur_api_version_t version, ur_global_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnAdapterGet = urAdapterGet; + pDdiTable->pfnAdapterGetInfo = urAdapterGetInfo; + pDdiTable->pfnAdapterRelease = urAdapterRelease; + pDdiTable->pfnAdapterRetain = urAdapterRetain; + pDdiTable->pfnAdapterGetLastError = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( + ur_api_version_t version, ur_queue_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreate = urQueueCreate; + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnFinish = urQueueFinish; + pDdiTable->pfnFlush = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnRelease = urQueueRelease; + pDdiTable->pfnRetain = urQueueRetain; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL +urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc; + pDdiTable->pfnFree = urUSMFree; + pDdiTable->pfnGetMemAllocInfo = nullptr; + pDdiTable->pfnHostAlloc = urUSMHostAlloc; + pDdiTable->pfnPoolCreate = nullptr; + pDdiTable->pfnPoolRetain = nullptr; + pDdiTable->pfnPoolRelease = nullptr; + pDdiTable->pfnPoolGetInfo = nullptr; 
+ pDdiTable->pfnSharedAlloc = urUSMSharedAlloc; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable( + ur_api_version_t version, ur_device_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnCreateWithNativeHandle = nullptr; + pDdiTable->pfnGet = urDeviceGet; + pDdiTable->pfnGetGlobalTimestamps = nullptr; + pDdiTable->pfnGetInfo = urDeviceGetInfo; + pDdiTable->pfnGetNativeHandle = nullptr; + pDdiTable->pfnPartition = urDevicePartition; + pDdiTable->pfnRelease = urDeviceRelease; + pDdiTable->pfnRetain = urDeviceRetain; + pDdiTable->pfnSelectBinary = urDeviceSelectBinary; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable( + ur_api_version_t version, ur_command_buffer_exp_dditable_t *pDdiTable) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + pDdiTable->pfnCreateExp = nullptr; + pDdiTable->pfnRetainExp = nullptr; + pDdiTable->pfnReleaseExp = nullptr; + pDdiTable->pfnFinalizeExp = nullptr; + pDdiTable->pfnAppendKernelLaunchExp = nullptr; + pDdiTable->pfnAppendUSMMemcpyExp = nullptr; + pDdiTable->pfnAppendMemBufferCopyExp = nullptr; + pDdiTable->pfnAppendMemBufferCopyRectExp = nullptr; + pDdiTable->pfnAppendMemBufferReadExp = nullptr; + pDdiTable->pfnAppendMemBufferReadRectExp = nullptr; + pDdiTable->pfnAppendMemBufferWriteExp = nullptr; + pDdiTable->pfnAppendMemBufferWriteRectExp = nullptr; + pDdiTable->pfnUpdateKernelLaunchExp = nullptr; + pDdiTable->pfnGetInfoExp = nullptr; + + return retVal; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable( + ur_api_version_t version, ur_usm_p2p_exp_dditable_t *pDdiTable) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + pDdiTable->pfnEnablePeerAccessExp = nullptr; + 
pDdiTable->pfnDisablePeerAccessExp = nullptr; + pDdiTable->pfnPeerAccessGetInfoExp = nullptr; + + return retVal; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable( + ur_api_version_t version, ur_bindless_images_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnUnsampledImageHandleDestroyExp = nullptr; + pDdiTable->pfnSampledImageHandleDestroyExp = nullptr; + pDdiTable->pfnImageAllocateExp = nullptr; + pDdiTable->pfnImageFreeExp = nullptr; + pDdiTable->pfnUnsampledImageCreateExp = nullptr; + pDdiTable->pfnSampledImageCreateExp = nullptr; + pDdiTable->pfnImageCopyExp = nullptr; + pDdiTable->pfnImageGetInfoExp = nullptr; + pDdiTable->pfnMipmapGetLevelExp = nullptr; + pDdiTable->pfnMipmapFreeExp = nullptr; + pDdiTable->pfnImportExternalMemoryExp = nullptr; + pDdiTable->pfnMapExternalArrayExp = nullptr; + pDdiTable->pfnMapExternalLinearMemoryExp = nullptr; + pDdiTable->pfnReleaseExternalMemoryExp = nullptr; + pDdiTable->pfnImportExternalSemaphoreExp = nullptr; + pDdiTable->pfnReleaseExternalSemaphoreExp = nullptr; + pDdiTable->pfnWaitExternalSemaphoreExp = nullptr; + pDdiTable->pfnSignalExternalSemaphoreExp = nullptr; + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable( + ur_api_version_t version, ur_physical_mem_dditable_t *pDdiTable) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + + pDdiTable->pfnCreate = nullptr; + pDdiTable->pfnRelease = nullptr; + pDdiTable->pfnRetain = nullptr; + + return retVal; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable( + ur_api_version_t version, ur_usm_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + pDdiTable->pfnPitchedAllocExp = nullptr; + return UR_RESULT_SUCCESS; +} + 
+UR_DLLEXPORT ur_result_t UR_APICALL urGetVirtualMemProcAddrTable( + ur_api_version_t version, ///< [in] API version requested + ur_virtual_mem_dditable_t + *pDdiTable ///< [in,out] pointer to table of DDI function pointers +) { + auto retVal = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != retVal) { + return retVal; + } + + pDdiTable->pfnFree = nullptr; + pDdiTable->pfnGetInfo = nullptr; + pDdiTable->pfnGranularityGetInfo = nullptr; + pDdiTable->pfnMap = nullptr; + pDdiTable->pfnReserve = nullptr; + pDdiTable->pfnSetAccess = nullptr; + pDdiTable->pfnUnmap = nullptr; + + return retVal; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable( + ur_api_version_t version, ur_enqueue_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnCooperativeKernelLaunchExp = nullptr; + pDdiTable->pfnTimestampRecordingExp = nullptr; + pDdiTable->pfnNativeCommandExp = nullptr; + + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable( + ur_api_version_t version, ur_kernel_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = nullptr; + + return UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( + ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnBuildExp = urProgramBuildExp; + pDdiTable->pfnCompileExp = nullptr; + pDdiTable->pfnLinkExp = nullptr; + + return UR_RESULT_SUCCESS; +} +} // extern "C" diff --git a/unified-runtime/source/adapters/offload/usm.cpp b/unified-runtime/source/adapters/offload/usm.cpp new file mode 100644 index 0000000000000..a597cf87c21b1 --- /dev/null 
+++ b/unified-runtime/source/adapters/offload/usm.cpp @@ -0,0 +1,53 @@ +#include +#include +#include + +#include "context.hpp" +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext, + const ur_usm_desc_t *, + ur_usm_pool_handle_t, + size_t size, void **ppMem) { + auto Res = olMemAlloc(reinterpret_cast(hContext->Device), + OL_ALLOC_TYPE_HOST, size, ppMem); + + if (Res != OL_SUCCESS) { + return offloadResultToUR(Res); + } + + hContext->AllocTypeMap.insert_or_assign(*ppMem, OL_ALLOC_TYPE_HOST); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( + ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *, + ur_usm_pool_handle_t, size_t size, void **ppMem) { + auto Res = olMemAlloc(reinterpret_cast(hContext->Device), + OL_ALLOC_TYPE_DEVICE, size, ppMem); + + if (Res != OL_SUCCESS) { + return offloadResultToUR(Res); + } + + hContext->AllocTypeMap.insert_or_assign(*ppMem, OL_ALLOC_TYPE_DEVICE); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( + ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *, + ur_usm_pool_handle_t, size_t size, void **ppMem) { + auto Res = olMemAlloc(reinterpret_cast(hContext->Device), + OL_ALLOC_TYPE_MANAGED, size, ppMem); + + if (Res != OL_SUCCESS) { + return offloadResultToUR(Res); + } + + hContext->AllocTypeMap.insert_or_assign(*ppMem, OL_ALLOC_TYPE_MANAGED); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t, void *pMem) { + return offloadResultToUR(olMemFree(pMem)); +} From 29e361f04fef8cb85ae426b4256f137a5cf0464d Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Thu, 24 Apr 2025 12:01:48 +0100 Subject: [PATCH 02/11] Support compiling on non-CUDA This makes the dependency on cudadrv optional and ifdefs away the cubin workaround when CUDA is not found.
This isn't sufficient to have HIP devices compile kernels, but does allow libur_adapter_offload to be built on said hosts. In addition, an unused variable error was fixed. --- .../source/adapters/offload/CMakeLists.txt | 10 ++- .../source/adapters/offload/adapter.cpp | 2 + .../source/adapters/offload/program.cpp | 86 ++++++++++++------- .../test/conformance/CMakeLists.txt | 2 +- 4 files changed, 68 insertions(+), 32 deletions(-) diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt index d3559c2ae7761..6461fabe647b2 100644 --- a/unified-runtime/source/adapters/offload/CMakeLists.txt +++ b/unified-runtime/source/adapters/offload/CMakeLists.txt @@ -14,7 +14,7 @@ endif() # For the PTX workaround we need to link with CUDA. if (NOT TARGET cudadrv) - find_package(CUDA 10.1 REQUIRED) + find_package(CUDA 10.1) add_library(cudadrv SHARED IMPORTED GLOBAL) set_target_properties( cudadrv PROPERTIES @@ -49,9 +49,15 @@ target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::common ${PROJECT_NAME}::umf ${UR_OFFLOAD_INSTALL_DIR}/lib/libLLVMOffload.so - cudadrv ) +if (CUDA_CUDA_LIBRARY) + target_link_libraries(${TARGET_NAME} + cudadrv + ) + target_compile_definitions(${TARGET_NAME} PRIVATE UR_CUDA_ENABLED=1) +endif() + target_include_directories(${TARGET_NAME} PRIVATE "${UR_OFFLOAD_INCLUDE_DIR}/offload" "${CMAKE_CURRENT_SOURCE_DIR}/../../" diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp index bde5dc8e8e6a5..6c4962a69e748 100644 --- a/unified-runtime/source/adapters/offload/adapter.cpp +++ b/unified-runtime/source/adapters/offload/adapter.cpp @@ -46,6 +46,8 @@ ur_result_t ur_adapter_handle_t_::init() { }, this); + (void)Res; + return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp index a55644efdd4f7..8067a5d17056a 100644 --- 
a/unified-runtime/source/adapters/offload/program.cpp +++ b/unified-runtime/source/adapters/offload/program.cpp @@ -1,12 +1,63 @@ #include #include #include -#include #include "context.hpp" #include "program.hpp" #include "ur2offload.hpp" +#ifdef UR_CUDA_ENABLED +#include +#endif + +namespace { +// Workaround for Offload not supporting PTX binaries. Force CUDA programs +// to be linked so they end up as CUBIN. +#ifdef UR_CUDA_ENABLED +ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t hContext, + const uint8_t *Binary, size_t Length, + ur_program_handle_t *phProgram) { + uint8_t *RealBinary; + size_t RealLength; + CUlinkState State; + cuLinkCreate(0, nullptr, nullptr, &State); + + cuLinkAddData(State, CU_JIT_INPUT_PTX, (char *)(Binary), Length, nullptr, 0, + nullptr, nullptr); + + void *CuBin = nullptr; + size_t CuBinSize = 0; + cuLinkComplete(State, &CuBin, &CuBinSize); + RealBinary = (uint8_t *)CuBin; + RealLength = CuBinSize; + fprintf(stderr, "Performed CUDA bin workaround (size = %lu)\n", RealLength); + + ur_program_handle_t Program = new ur_program_handle_t_(); + auto Res = + olCreateProgram(reinterpret_cast(hContext->Device), + RealBinary, RealLength, &Program->OffloadProgram); + + // Program owns the linked module now + cuLinkDestroy(State); + (void)State; + + if (Res != OL_SUCCESS) { + delete Program; + return offloadResultToUR(Res); + } + + *phProgram = Program; + + return UR_RESULT_SUCCESS; +} +#else +ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t, const uint8_t *, + size_t, ur_program_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} +#endif +} // namespace + UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_context_handle_t hContext, uint32_t numDevices, ur_device_handle_t *phDevices, size_t *pLengths, const uint8_t **ppBinaries, @@ -15,45 +66,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - // Workaround for Offload not supporting PTX 
binaries. Force CUDA programs - // to be linked so they end up as CUBIN. - uint8_t *RealBinary; - size_t RealLength; ur_platform_handle_t DevicePlatform; - bool DidLink = false; - CUlinkState State; urDeviceGetInfo(phDevices[0], UR_DEVICE_INFO_PLATFORM, sizeof(ur_platform_handle_t), &DevicePlatform, nullptr); ur_platform_backend_t PlatformBackend; urPlatformGetInfo(DevicePlatform, UR_PLATFORM_INFO_BACKEND, sizeof(ur_platform_backend_t), &PlatformBackend, nullptr); if (PlatformBackend == UR_PLATFORM_BACKEND_CUDA) { - cuLinkCreate(0, nullptr, nullptr, &State); - - cuLinkAddData(State, CU_JIT_INPUT_PTX, (char *)(ppBinaries[0]), pLengths[0], - nullptr, 0, nullptr, nullptr); - - void *CuBin = nullptr; - size_t CuBinSize = 0; - cuLinkComplete(State, &CuBin, &CuBinSize); - RealBinary = (uint8_t *)CuBin; - RealLength = CuBinSize; - DidLink = true; - fprintf(stderr, "Performed CUDA bin workaround (size = %lu)\n", RealLength); - } else { - RealBinary = const_cast(ppBinaries[0]); - RealLength = pLengths[0]; + return ProgramCreateCudaWorkaround(hContext, ppBinaries[0], pLengths[0], + phProgram); } + auto *RealBinary = const_cast(ppBinaries[0]); + ur_program_handle_t Program = new ur_program_handle_t_(); auto Res = olCreateProgram(reinterpret_cast(hContext->Device), - RealBinary, RealLength, &Program->OffloadProgram); - - // Program owns the linked module now - if (DidLink) { - cuLinkDestroy(State); - } + RealBinary, pLengths[0], &Program->OffloadProgram); if (Res != OL_SUCCESS) { delete Program; @@ -80,7 +109,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(ur_program_handle_t, return UR_RESULT_SUCCESS; } - UR_APIEXPORT ur_result_t UR_APICALL urProgramRetain(ur_program_handle_t hProgram) { hProgram->RefCount++; diff --git a/unified-runtime/test/conformance/CMakeLists.txt b/unified-runtime/test/conformance/CMakeLists.txt index 9e29e727e2e2a..73f7f10aaabf6 100644 --- a/unified-runtime/test/conformance/CMakeLists.txt +++ b/unified-runtime/test/conformance/CMakeLists.txt 
@@ -131,7 +131,7 @@ if(UR_FOUND_DPCXX) if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL) list(APPEND TARGET_TRIPLES "nvptx64-nvidia-cuda") endif() - if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_ALL) + if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_ALL) list(APPEND TARGET_TRIPLES "amdgcn-amd-amdhsa") endif() else() From fee5bd1ea7f48bd76006bb57de9442597d6a0910 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Thu, 24 Apr 2025 12:49:35 +0100 Subject: [PATCH 03/11] Add "offload" as an adapter to the various registries This adds "offload" to several locations, meaning that: * It will be initialised by the loader and iterated like other adapters. * ONEAPI_DEVICE_SELECTOR="offload:*" works (note that this is an extension to the ONEAPI_DEVICE_SELECTOR format). * Platforms and adapters now report themselves as "OFFLOAD" rather than "CUDA" or "HIP". --- unified-runtime/include/ur_api.h | 4 ++++ unified-runtime/include/ur_print.hpp | 6 ++++++ unified-runtime/scripts/core/adapter.yml | 3 +++ unified-runtime/scripts/core/manifests.yml | 7 +++++++ unified-runtime/scripts/core/platform.yml | 3 +++ .../source/adapters/CMakeLists.txt | 1 + .../source/adapters/offload/adapter.cpp | 2 +- .../source/adapters/offload/platform.cpp | 20 +------------------ .../source/loader/ur_adapter_registry.hpp | 1 + unified-runtime/source/loader/ur_lib.cpp | 6 ++++-- .../source/loader/ur_manifests.hpp | 7 +++++++ 11 files changed, 38 insertions(+), 22 deletions(-) diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index dcf05b2b066c7..36fe8ac06bf94 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -1422,6 +1422,8 @@ typedef enum ur_adapter_backend_t { UR_ADAPTER_BACKEND_HIP = 4, /// The backend is Native CPU UR_ADAPTER_BACKEND_NATIVE_CPU = 5, + /// The backend is liboffload + UR_ADAPTER_BACKEND_OFFLOAD = 0x100, /// @cond UR_ADAPTER_BACKEND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1801,6 +1803,8 @@ typedef 
enum ur_platform_backend_t { UR_PLATFORM_BACKEND_HIP = 4, /// The backend is Native CPU UR_PLATFORM_BACKEND_NATIVE_CPU = 5, + /// The backend is liboffload + UR_PLATFORM_BACKEND_OFFLOAD = 0x100, /// @cond UR_PLATFORM_BACKEND_FORCE_UINT32 = 0x7fffffff /// @endcond diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index c5333f76f478e..63656894ae0e9 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -2356,6 +2356,9 @@ inline std::ostream &operator<<(std::ostream &os, case UR_ADAPTER_BACKEND_NATIVE_CPU: os << "UR_ADAPTER_BACKEND_NATIVE_CPU"; break; + case UR_ADAPTER_BACKEND_OFFLOAD: + os << "UR_ADAPTER_BACKEND_OFFLOAD"; + break; default: os << "unknown enumerator"; break; @@ -2553,6 +2556,9 @@ inline std::ostream &operator<<(std::ostream &os, case UR_PLATFORM_BACKEND_NATIVE_CPU: os << "UR_PLATFORM_BACKEND_NATIVE_CPU"; break; + case UR_PLATFORM_BACKEND_OFFLOAD: + os << "UR_PLATFORM_BACKEND_OFFLOAD"; + break; default: os << "unknown enumerator"; break; diff --git a/unified-runtime/scripts/core/adapter.yml b/unified-runtime/scripts/core/adapter.yml index d806d48974a4f..8253104386b05 100644 --- a/unified-runtime/scripts/core/adapter.yml +++ b/unified-runtime/scripts/core/adapter.yml @@ -209,6 +209,9 @@ etors: - name: NATIVE_CPU value: "5" desc: "The backend is Native CPU" + - name: OFFLOAD + value: "0x100" + desc: "The backend is liboffload" --- #-------------------------------------------------------------------------- type: enum desc: "Minimum level of messages to be processed by the logger." 
diff --git a/unified-runtime/scripts/core/manifests.yml b/unified-runtime/scripts/core/manifests.yml index 6b9647852daea..da58ebb57df34 100644 --- a/unified-runtime/scripts/core/manifests.yml +++ b/unified-runtime/scripts/core/manifests.yml @@ -61,3 +61,10 @@ name: native_cpu backend: $X_ADAPTER_BACKEND_NATIVE_CPU device_types: - $X_DEVICE_TYPE_CPU +--- #-------------------------------------------------------------------------- +type: manifest +name: offload +backend: $X_ADAPTER_BACKEND_OFFLOAD +device_types: + - $X_DEVICE_TYPE_CPU + - $X_DEVICE_TYPE_GPU diff --git a/unified-runtime/scripts/core/platform.yml b/unified-runtime/scripts/core/platform.yml index 7d4edf5c0b5c0..84c7a99d6e833 100644 --- a/unified-runtime/scripts/core/platform.yml +++ b/unified-runtime/scripts/core/platform.yml @@ -279,3 +279,6 @@ etors: - name: NATIVE_CPU value: "5" desc: "The backend is Native CPU" + - name: OFFLOAD + value: "0x100" + desc: "The backend is liboffload" diff --git a/unified-runtime/source/adapters/CMakeLists.txt b/unified-runtime/source/adapters/CMakeLists.txt index 34ba19b6e8859..8c357caa21946 100644 --- a/unified-runtime/source/adapters/CMakeLists.txt +++ b/unified-runtime/source/adapters/CMakeLists.txt @@ -76,6 +76,7 @@ endif() if(UR_BUILD_ADAPTER_OFFLOAD) add_ur_adapter_subdirectory(offload) + list(APPEND TEMP_LIST "offload") endif() set(UR_ADAPTERS_LIST "${TEMP_LIST}" CACHE STRING "" FORCE) diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp index 6c4962a69e748..6299bd3280de1 100644 --- a/unified-runtime/source/adapters/offload/adapter.cpp +++ b/unified-runtime/source/adapters/offload/adapter.cpp @@ -87,7 +87,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, switch (propName) { case UR_ADAPTER_INFO_BACKEND: - return ReturnValue(UR_ADAPTER_BACKEND_CUDA); // TODO: Return a proper value + return ReturnValue(UR_ADAPTER_BACKEND_OFFLOAD); case UR_ADAPTER_INFO_REFERENCE_COUNT: 
return ReturnValue(Adapter.RefCount.load()); default: diff --git a/unified-runtime/source/adapters/offload/platform.cpp b/unified-runtime/source/adapters/offload/platform.cpp index 50b9ac90649ce..02b992c96351d 100644 --- a/unified-runtime/source/adapters/offload/platform.cpp +++ b/unified-runtime/source/adapters/offload/platform.cpp @@ -49,7 +49,7 @@ urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, case UR_PLATFORM_INFO_PROFILE: return ReturnValue("FULL_PROFILE"); case UR_PLATFORM_INFO_BACKEND: - olInfo = OL_PLATFORM_INFO_BACKEND; + return ReturnValue(UR_PLATFORM_BACKEND_OFFLOAD); break; default: return UR_RESULT_ERROR_INVALID_ENUMERATION; @@ -69,24 +69,6 @@ urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, olInfo, propSize, pPropValue)) { return offloadResultToUR(Res); } - - // Need to explicitly map this type - if (olInfo == OL_PLATFORM_INFO_BACKEND) { - auto urPropPtr = reinterpret_cast(pPropValue); - auto olPropPtr = reinterpret_cast(pPropValue); - - switch (*olPropPtr) { - case OL_PLATFORM_BACKEND_CUDA: - *urPropPtr = UR_PLATFORM_BACKEND_CUDA; - break; - case OL_PLATFORM_BACKEND_AMDGPU: - *urPropPtr = UR_PLATFORM_BACKEND_HIP; - break; - default: - *urPropPtr = UR_PLATFORM_BACKEND_UNKNOWN; - break; - } - } } return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/loader/ur_adapter_registry.hpp b/unified-runtime/source/loader/ur_adapter_registry.hpp index 53fed55ea0c8a..36e5e9a602756 100644 --- a/unified-runtime/source/loader/ur_adapter_registry.hpp +++ b/unified-runtime/source/loader/ur_adapter_registry.hpp @@ -40,6 +40,7 @@ struct FilterTerm { {"cuda", UR_ADAPTER_BACKEND_CUDA}, {"hip", UR_ADAPTER_BACKEND_HIP}, {"native_cpu", UR_ADAPTER_BACKEND_NATIVE_CPU}, + {"offload", UR_ADAPTER_BACKEND_OFFLOAD}, }; bool matchesBackend(const ur_adapter_backend_t &match_backend) const { diff --git a/unified-runtime/source/loader/ur_lib.cpp b/unified-runtime/source/loader/ur_lib.cpp index 
8163be0fbaebf..c224ca00b1777 100644 --- a/unified-runtime/source/loader/ur_lib.cpp +++ b/unified-runtime/source/loader/ur_lib.cpp @@ -251,13 +251,15 @@ ur_result_t urDeviceGetSelected(ur_platform_handle_t hPlatform, uint32_t NumEntries, ur_device_handle_t *phDevices, uint32_t *pNumDevices) { - constexpr std::pair adapters[6] = { + constexpr std::pair adapters[7] = { {UR_PLATFORM_BACKEND_UNKNOWN, "*"}, {UR_PLATFORM_BACKEND_LEVEL_ZERO, "level_zero"}, {UR_PLATFORM_BACKEND_OPENCL, "opencl"}, {UR_PLATFORM_BACKEND_CUDA, "cuda"}, {UR_PLATFORM_BACKEND_HIP, "hip"}, - {UR_PLATFORM_BACKEND_NATIVE_CPU, "native_cpu"}}; + {UR_PLATFORM_BACKEND_NATIVE_CPU, "native_cpu"}, + {UR_PLATFORM_BACKEND_OFFLOAD, "offload"}, + }; if (!hPlatform) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; diff --git a/unified-runtime/source/loader/ur_manifests.hpp b/unified-runtime/source/loader/ur_manifests.hpp index 2ed89fc79f4f0..9981d8d1fa5bf 100644 --- a/unified-runtime/source/loader/ur_manifests.hpp +++ b/unified-runtime/source/loader/ur_manifests.hpp @@ -79,5 +79,12 @@ const std::vector ur_adapter_manifests = { { UR_DEVICE_TYPE_CPU, }}, + {"offload", + MAKE_LIBRARY_NAME("ur_adapter_offload", "0"), + UR_ADAPTER_BACKEND_OFFLOAD, + { + UR_DEVICE_TYPE_CPU, + UR_DEVICE_TYPE_GPU, + }}, }; } // namespace ur_loader From e1c2d4fcfe3d90c793f768d53b4935425de5b979 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Thu, 24 Apr 2025 14:25:26 +0100 Subject: [PATCH 04/11] Quick CTS running for Offload It is non-trivial to determine which binary format (spir, ptx, amdhsa) is accepted by a given offload device. This should be fixed properly in the future, but for now let the user specify it via an environment variable.
--- unified-runtime/test/conformance/CMakeLists.txt | 4 ++-- .../test/conformance/platform/urPlatformGetInfo.cpp | 2 +- .../test/conformance/source/environment.cpp | 13 ++++++++++++- 3 files changed, 15 insertions(+), 4 deletions(-) diff --git a/unified-runtime/test/conformance/CMakeLists.txt b/unified-runtime/test/conformance/CMakeLists.txt index 73f7f10aaabf6..182b80affb423 100644 --- a/unified-runtime/test/conformance/CMakeLists.txt +++ b/unified-runtime/test/conformance/CMakeLists.txt @@ -125,10 +125,10 @@ if(UR_FOUND_DPCXX) file(MAKE_DIRECTORY ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}) if("${UR_CONFORMANCE_TARGET_TRIPLES}" STREQUAL "") - if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_L0_V2 OR UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL) + if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_L0_V2 OR UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL) list(APPEND TARGET_TRIPLES "spir64") endif() - if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL) + if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_ALL) list(APPEND TARGET_TRIPLES "nvptx64-nvidia-cuda") endif() if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_ALL) diff --git a/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp b/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp index 71a55c93292b2..06b269f54ae2d 100644 --- a/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp +++ b/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp @@ -106,7 +106,7 @@ TEST_P(urPlatformGetInfoTest, SuccessBackend) { &property_value, nullptr)); ASSERT_TRUE(property_value >= UR_PLATFORM_BACKEND_LEVEL_ZERO && - property_value <= UR_PLATFORM_BACKEND_NATIVE_CPU); + property_value <= UR_PLATFORM_BACKEND_OFFLOAD); } TEST_P(urPlatformGetInfoTest, SuccessAdapter) { diff --git a/unified-runtime/test/conformance/source/environment.cpp b/unified-runtime/test/conformance/source/environment.cpp index d1fe951e5ce00..d2f63c37929d8 100644 
--- a/unified-runtime/test/conformance/source/environment.cpp +++ b/unified-runtime/test/conformance/source/environment.cpp @@ -215,6 +215,16 @@ std::string KernelsEnvironment::getTargetName(ur_platform_handle_t platform) { return "nvptx64-nvidia-cuda"; case UR_PLATFORM_BACKEND_HIP: return "amdgcn-amd-amdhsa"; + case UR_PLATFORM_BACKEND_OFFLOAD: { + // TODO: In future this should use urDeviceSelectBinary + auto result = ur_getenv("UR_OFFLOAD_TARGET_NAME"); + if (!result) { + error = "For offload testing, please specify a target in " + "`UR_OFFLOAD_TARGET_NAME`"; + return {}; + } + return *result; + } case UR_PLATFORM_BACKEND_NATIVE_CPU: error = "native_cpu doesn't support kernel tests yet"; return {}; @@ -297,7 +307,8 @@ void KernelsEnvironment::CreateProgram( sizeof(ur_platform_backend_t), &backend, nullptr)); if (backend == UR_PLATFORM_BACKEND_HIP || - backend == UR_PLATFORM_BACKEND_CUDA) { + backend == UR_PLATFORM_BACKEND_CUDA || + backend == UR_PLATFORM_BACKEND_OFFLOAD) { // The CUDA and HIP adapters do not support urProgramCreateWithIL so we // need to use urProgramCreateWithBinary instead. auto size = binary.size(); From e5de2c4d9c84724d4d2f3ffef0ba8295d8fb85a0 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Tue, 29 Apr 2025 10:24:36 +0100 Subject: [PATCH 05/11] Parse HIP "offload" bundles SYCL and the UR CTS produce HIP "offload bundles" when compiling for AMDGPU, which cannot be accepted by the basic AMD offload plugin. This change adds a simple offload bundle parser which extracts the appropriate binary from the bundle, allowing it to be fed to liboffload. 
--- .../source/adapters/offload/program.cpp | 105 +++++++++++++++++- 1 file changed, 101 insertions(+), 4 deletions(-) diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp index 8067a5d17056a..acafef8b73d66 100644 --- a/unified-runtime/source/adapters/offload/program.cpp +++ b/unified-runtime/source/adapters/offload/program.cpp @@ -56,6 +56,86 @@ ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t, const uint8_t *, return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } #endif + +// https://clang.llvm.org/docs/ClangOffloadBundler.html#bundled-binary-file-layout +class HipOffloadBundleParser { + static constexpr std::string_view Magic = "__CLANG_OFFLOAD_BUNDLE__"; + const uint8_t *Buff; + size_t Length; + + struct __attribute__((packed)) BundleEntry { + uint64_t ObjectOffset; + uint64_t ObjectSize; + uint64_t EntryIdSize; + char EntryIdStart; + }; + + struct __attribute__((packed)) BundleHeader { + const char HeaderMagic[Magic.size()]; + uint64_t EntryCount; + BundleEntry FirstEntry; + }; + + HipOffloadBundleParser() = delete; + HipOffloadBundleParser(const uint8_t *Buff, size_t Length) + : Buff(Buff), Length(Length) {} + +public: + static std::optional load(const uint8_t *Buff, + size_t Length) { + if (std::string_view{reinterpret_cast(Buff), Length}.find( + Magic) != 0) { + return std::nullopt; + } + return HipOffloadBundleParser(Buff, Length); + } + + ur_result_t extract(std::string_view SearchTargetId, + const uint8_t *&OutBinary, size_t &OutLength) { + const char *Limit = reinterpret_cast(&Buff[Length]); + + // The different check here means that a binary consisting of only the magic + // bytes (but nothing else) will result in INVALID_PROGRAM rather than being + // treated as a non-bundle + auto *Header = reinterpret_cast(Buff); + if (reinterpret_cast(&Header->FirstEntry) > Limit) { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + + const auto *CurrentEntry = &Header->FirstEntry; + for (uint64_t I 
= 0; I < Header->EntryCount; I++) { + if (&CurrentEntry->EntryIdStart > Limit) { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + auto EntryId = std::string_view(&CurrentEntry->EntryIdStart, + CurrentEntry->EntryIdSize); + if (EntryId.end() > Limit) { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + + // Will match either "hip" or "hipv4" + bool isHip = EntryId.find("hip") == 0; + bool VersionMatches = + EntryId.find_last_of(SearchTargetId) == EntryId.size() - 1; + + if (isHip && VersionMatches) { + OutBinary = reinterpret_cast( + &Buff[CurrentEntry->ObjectOffset]); + OutLength = CurrentEntry->ObjectSize; + + if (reinterpret_cast(&OutBinary[OutLength]) > Limit) { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + return UR_RESULT_SUCCESS; + } + + CurrentEntry = reinterpret_cast(EntryId.end()); + } + + return UR_RESULT_ERROR_INVALID_PROGRAM; + } +}; + } // namespace UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( @@ -72,17 +152,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( ur_platform_backend_t PlatformBackend; urPlatformGetInfo(DevicePlatform, UR_PLATFORM_INFO_BACKEND, sizeof(ur_platform_backend_t), &PlatformBackend, nullptr); + + auto *RealBinary = ppBinaries[0]; + size_t RealLength = pLengths[0]; + + if (auto Parser = HipOffloadBundleParser::load(RealBinary, RealLength)) { + std::string DevName{}; + size_t DevNameLength; + olGetDeviceInfoSize(reinterpret_cast(phDevices[0]), + OL_DEVICE_INFO_NAME, &DevNameLength); + DevName.resize(DevNameLength); + olGetDeviceInfo(reinterpret_cast(phDevices[0]), + OL_DEVICE_INFO_NAME, DevNameLength, DevName.data()); + + auto Res = Parser->extract(DevName, RealBinary, RealLength); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } + } + if (PlatformBackend == UR_PLATFORM_BACKEND_CUDA) { - return ProgramCreateCudaWorkaround(hContext, ppBinaries[0], pLengths[0], + return ProgramCreateCudaWorkaround(hContext, RealBinary, RealLength, phProgram); } - auto *RealBinary = const_cast(ppBinaries[0]); - 
ur_program_handle_t Program = new ur_program_handle_t_(); auto Res = olCreateProgram(reinterpret_cast(hContext->Device), - RealBinary, pLengths[0], &Program->OffloadProgram); + RealBinary, RealLength, &Program->OffloadProgram); if (Res != OL_SUCCESS) { delete Program; From d2c52a46213af0a87fcc4720cfce42b9f84f8f9d Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Tue, 29 Apr 2025 12:25:57 +0100 Subject: [PATCH 06/11] Use proper handle types --- .../source/adapters/offload/adapter.cpp | 32 ++++++++---- .../source/adapters/offload/adapter.hpp | 6 ++- .../source/adapters/offload/common.hpp | 6 ++- .../source/adapters/offload/context.hpp | 13 ++--- .../source/adapters/offload/device.cpp | 49 ++++++------------- .../source/adapters/offload/device.hpp | 14 ++++++ .../source/adapters/offload/platform.cpp | 14 +++--- .../source/adapters/offload/platform.hpp | 13 +++++ .../source/adapters/offload/program.cpp | 11 ++--- .../source/adapters/offload/queue.cpp | 6 +-- .../source/adapters/offload/usm.cpp | 13 ++--- 11 files changed, 98 insertions(+), 79 deletions(-) create mode 100644 unified-runtime/source/adapters/offload/device.hpp create mode 100644 unified-runtime/source/adapters/offload/platform.hpp diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp index 6299bd3280de1..9371e6684ca9e 100644 --- a/unified-runtime/source/adapters/offload/adapter.cpp +++ b/unified-runtime/source/adapters/offload/adapter.cpp @@ -8,12 +8,14 @@ // //===----------------------------------------------------------------------===// +#include #include #include -#include #include #include "adapter.hpp" +#include "device.hpp" +#include "platform.hpp" #include "ur/ur.hpp" #include "ur_api.h" @@ -23,14 +25,15 @@ ur_adapter_handle_t_ Adapter{}; ur_result_t ur_adapter_handle_t_::init() { auto Res = olInit(); - // Discover every platform that isn't the host platform. 
- // Use an unordered_set to deduplicate platforms we discover multiple times - // from different devices. - // Also discover the host device. We only expect one so don't need to worry - // about overwriting it. + struct InitUserData { + std::unordered_map TempMap; + } InitUserData{{}}; + + // Discover every platform and device Res = olIterateDevices( [](ol_device_handle_t D, void *UserData) { - auto Adapter = static_cast(UserData); + auto *Data = reinterpret_cast(UserData); + ol_platform_handle_t Platform; olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), &Platform); @@ -38,13 +41,22 @@ ur_result_t ur_adapter_handle_t_::init() { olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend), &Backend); if (Backend == OL_PLATFORM_BACKEND_HOST) { - Adapter->HostDevice = D; + Adapter.HostDevice = D; } else if (Backend != OL_PLATFORM_BACKEND_UNKNOWN) { - Adapter->Platforms.insert(Platform); + ur_platform_handle_t UrPlatform; + if (!Data->TempMap.count(Platform)) { + Adapter.Platforms.push_back(ur_platform_handle_t_{Platform}); + UrPlatform = &Adapter.Platforms.back(); + Data->TempMap.insert({Platform, UrPlatform}); + } else { + UrPlatform = Data->TempMap[Platform]; + } + + UrPlatform->Devices.push_back(ur_device_handle_t_{UrPlatform, D}); } return false; }, - this); + &InitUserData); (void)Res; diff --git a/unified-runtime/source/adapters/offload/adapter.hpp b/unified-runtime/source/adapters/offload/adapter.hpp index be36aceaa8410..a9fd927b55785 100644 --- a/unified-runtime/source/adapters/offload/adapter.hpp +++ b/unified-runtime/source/adapters/offload/adapter.hpp @@ -6,13 +6,15 @@ #include +#include "common.hpp" #include "logger/ur_logger.hpp" +#include "platform.hpp" -struct ur_adapter_handle_t_ { +struct ur_adapter_handle_t_ : ur::offload::handle_base { std::atomic_uint32_t RefCount = 0; logger::Logger &Logger = logger::get_logger("offload"); ol_device_handle_t HostDevice = nullptr; - std::unordered_set Platforms; + std::vector Platforms; 
ur_result_t init(); }; diff --git a/unified-runtime/source/adapters/offload/common.hpp b/unified-runtime/source/adapters/offload/common.hpp index 69aa6bff11e9f..152714bdc2cc5 100644 --- a/unified-runtime/source/adapters/offload/common.hpp +++ b/unified-runtime/source/adapters/offload/common.hpp @@ -2,6 +2,10 @@ #include -struct RefCounted { +namespace ur::offload { +struct handle_base {}; +} // namespace ur::offload + +struct RefCounted : ur::offload::handle_base { std::atomic_uint32_t RefCount = 1; }; diff --git a/unified-runtime/source/adapters/offload/context.hpp b/unified-runtime/source/adapters/offload/context.hpp index 9483ec1b4a8b8..ce43d428cd0c7 100644 --- a/unified-runtime/source/adapters/offload/context.hpp +++ b/unified-runtime/source/adapters/offload/context.hpp @@ -1,19 +1,16 @@ #pragma once -#include +#include "common.hpp" +#include #include #include -#include -struct ur_context_handle_t_ { +struct ur_context_handle_t_ : RefCounted { ur_context_handle_t_(ur_device_handle_t hDevice) : Device{hDevice} { urDeviceRetain(Device); } - ~ur_context_handle_t_() { - urDeviceRelease(Device); - } + ~ur_context_handle_t_() { urDeviceRelease(Device); } ur_device_handle_t Device; - std::atomic_uint32_t RefCount; - std::unordered_map AllocTypeMap; + std::unordered_map AllocTypeMap; }; diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp index 54cc60c6fe110..d67f3555cf640 100644 --- a/unified-runtime/source/adapters/offload/device.cpp +++ b/unified-runtime/source/adapters/offload/device.cpp @@ -2,6 +2,8 @@ #include #include +#include "device.hpp" +#include "platform.hpp" #include "ur2offload.hpp" UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, @@ -9,38 +11,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform, uint32_t NumEntries, ur_device_handle_t *phDevices, uint32_t *pNumDevices) { + if (pNumDevices) { + *pNumDevices = 
static_cast(hPlatform->Devices.size()); + } - uint32_t NumDevices = 0; - // Pass a few things to the callback (we can't use a lambda with captures) - using ParamsT = struct { - uint32_t DeviceLimit; - uint32_t &NumDevices; - ol_platform_handle_t Platform; - ol_device_handle_t *DevicesOut; - }; - ParamsT Params = {NumEntries, NumDevices, - reinterpret_cast(hPlatform), - reinterpret_cast(phDevices)}; - - olIterateDevices( - [](ol_device_handle_t D, void *Data) { - auto Params = reinterpret_cast(Data); - ol_platform_handle_t Platform = nullptr; - olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), - &Platform); - if (Platform == Params->Platform) { - if (Params->DevicesOut) { - Params->DevicesOut[Params->NumDevices] = D; - } - Params->NumDevices++; - } - return Params->NumDevices == Params->DeviceLimit; - }, - &Params); + size_t NumDevices = + std::min(static_cast(hPlatform->Devices.size()), NumEntries); - if (pNumDevices) { - *pNumDevices = NumDevices; + for (size_t I = 0; I < NumDevices; I++) { + phDevices[I] = &hPlatform->Devices[I]; } + return UR_RESULT_SUCCESS; } @@ -74,7 +55,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, olInfo = OL_DEVICE_INFO_DRIVER_VERSION; break; case UR_DEVICE_INFO_PLATFORM: - olInfo = OL_DEVICE_INFO_PLATFORM; + return ReturnValue(hDevice->Platform); break; case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT: return ReturnValue(UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS); @@ -86,16 +67,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice, if (pPropSizeRet) { if (auto Res = - olGetDeviceInfoSize(reinterpret_cast(hDevice), - olInfo, pPropSizeRet)) { + olGetDeviceInfoSize(hDevice->OffloadDevice, olInfo, pPropSizeRet)) { return offloadResultToUR(Res); } } if (pPropValue) { - if (auto Res = - olGetDeviceInfo(reinterpret_cast(hDevice), - olInfo, propSize, pPropValue)) { + if (auto Res = olGetDeviceInfo(hDevice->OffloadDevice, olInfo, propSize, + pPropValue)) { return 
offloadResultToUR(Res); } // Need to explicitly map this type diff --git a/unified-runtime/source/adapters/offload/device.hpp b/unified-runtime/source/adapters/offload/device.hpp new file mode 100644 index 0000000000000..b1fc24792f3b4 --- /dev/null +++ b/unified-runtime/source/adapters/offload/device.hpp @@ -0,0 +1,14 @@ +#pragma once + +#include "common.hpp" +#include +#include + +struct ur_device_handle_t_ : ur::offload::handle_base { + ur_device_handle_t_(ur_platform_handle_t Platform, + ol_device_handle_t OffloadDevice) + : handle_base(), Platform(Platform), OffloadDevice(OffloadDevice) {} + + ur_platform_handle_t Platform; + ol_device_handle_t OffloadDevice; +}; diff --git a/unified-runtime/source/adapters/offload/platform.cpp b/unified-runtime/source/adapters/offload/platform.cpp index 02b992c96351d..133a653baa7ca 100644 --- a/unified-runtime/source/adapters/offload/platform.cpp +++ b/unified-runtime/source/adapters/offload/platform.cpp @@ -4,6 +4,7 @@ #include #include "adapter.hpp" +#include "device.hpp" #include "ur2offload.hpp" UR_APIEXPORT ur_result_t UR_APICALL @@ -17,8 +18,7 @@ urPlatformGet(ur_adapter_handle_t, uint32_t NumEntries, if (phPlatforms) { size_t PlatformIndex = 0; for (auto &Platform : Adapter.Platforms) { - phPlatforms[PlatformIndex++] = - reinterpret_cast(Platform); + phPlatforms[PlatformIndex++] = &Platform; if (PlatformIndex == NumEntries) { break; } @@ -56,17 +56,15 @@ urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, } if (pPropSizeRet) { - if (auto Res = olGetPlatformInfoSize( - reinterpret_cast(hPlatform), olInfo, - pPropSizeRet)) { + if (auto Res = olGetPlatformInfoSize(hPlatform->OffloadPlatform, olInfo, + pPropSizeRet)) { return offloadResultToUR(Res); } } if (pPropValue) { - if (auto Res = - olGetPlatformInfo(reinterpret_cast(hPlatform), - olInfo, propSize, pPropValue)) { + if (auto Res = olGetPlatformInfo(hPlatform->OffloadPlatform, olInfo, + propSize, pPropValue)) { return offloadResultToUR(Res); 
} } diff --git a/unified-runtime/source/adapters/offload/platform.hpp b/unified-runtime/source/adapters/offload/platform.hpp new file mode 100644 index 0000000000000..100e103998364 --- /dev/null +++ b/unified-runtime/source/adapters/offload/platform.hpp @@ -0,0 +1,13 @@ +#pragma once + +#include "common.hpp" +#include +#include +#include + +struct ur_platform_handle_t_ : ur::offload::handle_base { + ur_platform_handle_t_(ol_platform_handle_t OffloadPlatform) : handle_base(), OffloadPlatform(OffloadPlatform) {}; + + ol_platform_handle_t OffloadPlatform; + std::vector Devices; +}; diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp index acafef8b73d66..5425cbed42095 100644 --- a/unified-runtime/source/adapters/offload/program.cpp +++ b/unified-runtime/source/adapters/offload/program.cpp @@ -3,6 +3,7 @@ #include #include "context.hpp" +#include "device.hpp" #include "program.hpp" #include "ur2offload.hpp" @@ -33,9 +34,8 @@ ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t hContext, fprintf(stderr, "Performed CUDA bin workaround (size = %lu)\n", RealLength); ur_program_handle_t Program = new ur_program_handle_t_(); - auto Res = - olCreateProgram(reinterpret_cast(hContext->Device), - RealBinary, RealLength, &Program->OffloadProgram); + auto Res = olCreateProgram(hContext->Device->OffloadDevice, RealBinary, + RealLength, &Program->OffloadProgram); // Program owns the linked module now cuLinkDestroy(State); @@ -177,9 +177,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( } ur_program_handle_t Program = new ur_program_handle_t_(); - auto Res = - olCreateProgram(reinterpret_cast(hContext->Device), - RealBinary, RealLength, &Program->OffloadProgram); + auto Res = olCreateProgram(hContext->Device->OffloadDevice, RealBinary, + RealLength, &Program->OffloadProgram); if (Res != OL_SUCCESS) { delete Program; diff --git a/unified-runtime/source/adapters/offload/queue.cpp 
b/unified-runtime/source/adapters/offload/queue.cpp index 2900deab68c60..32adb1f512c11 100644 --- a/unified-runtime/source/adapters/offload/queue.cpp +++ b/unified-runtime/source/adapters/offload/queue.cpp @@ -3,6 +3,7 @@ #include #include "context.hpp" +#include "device.hpp" #include "queue.hpp" #include "ur2offload.hpp" @@ -13,14 +14,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( assert(hContext->Device == hDevice); ur_queue_handle_t Queue = new ur_queue_handle_t_(); - auto Res = olCreateQueue(reinterpret_cast(hDevice), - &Queue->OffloadQueue); + auto Res = olCreateQueue(hDevice->OffloadDevice, &Queue->OffloadQueue); if (Res != OL_SUCCESS) { delete Queue; return offloadResultToUR(Res); } - Queue->OffloadDevice = reinterpret_cast(hDevice); + Queue->OffloadDevice = hDevice->OffloadDevice; *phQueue = Queue; diff --git a/unified-runtime/source/adapters/offload/usm.cpp b/unified-runtime/source/adapters/offload/usm.cpp index a597cf87c21b1..f603516811f0a 100644 --- a/unified-runtime/source/adapters/offload/usm.cpp +++ b/unified-runtime/source/adapters/offload/usm.cpp @@ -3,14 +3,15 @@ #include #include "context.hpp" +#include "device.hpp" #include "ur2offload.hpp" UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext, const ur_usm_desc_t *, ur_usm_pool_handle_t, size_t size, void **ppMem) { - auto Res = olMemAlloc(reinterpret_cast(hContext->Device), - OL_ALLOC_TYPE_HOST, size, ppMem); + auto Res = olMemAlloc(hContext->Device->OffloadDevice, OL_ALLOC_TYPE_HOST, + size, ppMem); if (Res != OL_SUCCESS) { return offloadResultToUR(Res); @@ -23,8 +24,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext, UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *, ur_usm_pool_handle_t, size_t size, void **ppMem) { - auto Res = olMemAlloc(reinterpret_cast(hContext->Device), - OL_ALLOC_TYPE_DEVICE, size, ppMem); + auto Res = 
olMemAlloc(hContext->Device->OffloadDevice, OL_ALLOC_TYPE_DEVICE, + size, ppMem); if (Res != OL_SUCCESS) { return offloadResultToUR(Res); @@ -37,8 +38,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *, ur_usm_pool_handle_t, size_t size, void **ppMem) { - auto Res = olMemAlloc(reinterpret_cast(hContext->Device), - OL_ALLOC_TYPE_MANAGED, size, ppMem); + auto Res = olMemAlloc(hContext->Device->OffloadDevice, OL_ALLOC_TYPE_MANAGED, + size, ppMem); if (Res != OL_SUCCESS) { return offloadResultToUR(Res); From 4e28f8275f85bf2e1e22849358ff1733fa3aa506 Mon Sep 17 00:00:00 2001 From: Ross Brunton Date: Wed, 30 Apr 2025 14:47:14 +0100 Subject: [PATCH 07/11] Fix broken cast --- unified-runtime/source/adapters/offload/program.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp index 5425cbed42095..6384d7dd3ced2 100644 --- a/unified-runtime/source/adapters/offload/program.cpp +++ b/unified-runtime/source/adapters/offload/program.cpp @@ -159,11 +159,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( if (auto Parser = HipOffloadBundleParser::load(RealBinary, RealLength)) { std::string DevName{}; size_t DevNameLength; - olGetDeviceInfoSize(reinterpret_cast(phDevices[0]), - OL_DEVICE_INFO_NAME, &DevNameLength); + olGetDeviceInfoSize(phDevices[0]->OffloadDevice, OL_DEVICE_INFO_NAME, + &DevNameLength); DevName.resize(DevNameLength); - olGetDeviceInfo(reinterpret_cast(phDevices[0]), - OL_DEVICE_INFO_NAME, DevNameLength, DevName.data()); + olGetDeviceInfo(phDevices[0]->OffloadDevice, OL_DEVICE_INFO_NAME, + DevNameLength, DevName.data()); auto Res = Parser->extract(DevName, RealBinary, RealLength); if (Res != UR_RESULT_SUCCESS) { From a257d916e7e1a7df740bc8cfd29b06fb3d1ef7a4 Mon Sep 17 00:00:00 2001 From: 
Callum Fare Date: Tue, 29 Apr 2025 17:30:15 +0100 Subject: [PATCH 08/11] Fix Offload build on CUDA and detect correct targets in the CTS --- .../source/adapters/offload/CMakeLists.txt | 20 ++++++++++-------- .../source/adapters/offload/device.cpp | 19 ++++++++++------- .../source/adapters/offload/program.cpp | 17 +++++++-------- .../test/conformance/source/environment.cpp | 21 +++++++++++++------ 4 files changed, 46 insertions(+), 31 deletions(-) diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt index 6461fabe647b2..bbef36f3ff8d2 100644 --- a/unified-runtime/source/adapters/offload/CMakeLists.txt +++ b/unified-runtime/source/adapters/offload/CMakeLists.txt @@ -18,8 +18,8 @@ if (NOT TARGET cudadrv) add_library(cudadrv SHARED IMPORTED GLOBAL) set_target_properties( cudadrv PROPERTIES - IMPORTED_LOCATION ${CUDA_CUDA_LIBRARY} - INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS} + IMPORTED_LOCATION ${CUDA_cuda_driver_LIBRARY} + INTERFACE_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS} ) endif() @@ -44,20 +44,22 @@ set_target_properties(${TARGET_NAME} PROPERTIES SOVERSION "${PROJECT_VERSION_MAJOR}" ) +set(ADDITIONAL_LINK_LIBS "") +if (CUDA_cuda_driver_LIBRARY) + list(APPEND ADDITIONAL_LINK_LIBS + cudadrv + ) + target_compile_definitions(${TARGET_NAME} PRIVATE UR_CUDA_ENABLED) +endif() + target_link_libraries(${TARGET_NAME} PRIVATE ${PROJECT_NAME}::headers ${PROJECT_NAME}::common ${PROJECT_NAME}::umf ${UR_OFFLOAD_INSTALL_DIR}/lib/libLLVMOffload.so + ${ADDITIONAL_LINK_LIBS} ) -if (CUDA_CUDA_LIBRARY) - target_link_libraries(${TARGET_NAME} - cudadrv - ) - target_compile_definitions(${TARGET_NAME} PRIVATE UR_CUDA_ENABLED=1) -endif() - target_include_directories(${TARGET_NAME} PRIVATE "${UR_OFFLOAD_INCLUDE_DIR}/offload" "${CMAKE_CURRENT_SOURCE_DIR}/../../" diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp index 
d67f3555cf640..ec5929577b8dc 100644 --- a/unified-runtime/source/adapters/offload/device.cpp +++ b/unified-runtime/source/adapters/offload/device.cpp @@ -119,15 +119,20 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_properties_t *, UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries, uint32_t NumBinaries, uint32_t *pSelectedBinary) { - std::ignore = hDevice; - std::ignore = pBinaries; - std::ignore = NumBinaries; - std::ignore = pSelectedBinary; - // TODO: Don't hard code nvptx64! - const char *image_target = UR_DEVICE_BINARY_TARGET_NVPTX64; + ol_platform_backend_t Backend; + olGetPlatformInfo(hDevice->Platform->OffloadPlatform, + OL_PLATFORM_INFO_BACKEND, sizeof(Backend), &Backend); + + const char *ImageTarget = UR_DEVICE_BINARY_TARGET_UNKNOWN; + if (Backend == OL_PLATFORM_BACKEND_CUDA) { + ImageTarget = UR_DEVICE_BINARY_TARGET_NVPTX64; + } else if (Backend == OL_PLATFORM_BACKEND_AMDGPU) { + ImageTarget = UR_DEVICE_BINARY_TARGET_AMDGCN; + } + for (uint32_t i = 0; i < NumBinaries; ++i) { - if (strcmp(pBinaries[i].pDeviceTargetSpec, image_target) == 0) { + if (strcmp(pBinaries[i].pDeviceTargetSpec, ImageTarget) == 0) { *pSelectedBinary = i; return UR_RESULT_SUCCESS; } diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp index 6384d7dd3ced2..8489b3c5b0a7e 100644 --- a/unified-runtime/source/adapters/offload/program.cpp +++ b/unified-runtime/source/adapters/offload/program.cpp @@ -4,6 +4,7 @@ #include "context.hpp" #include "device.hpp" +#include "platform.hpp" #include "program.hpp" #include "ur2offload.hpp" @@ -31,7 +32,10 @@ ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t hContext, cuLinkComplete(State, &CuBin, &CuBinSize); RealBinary = (uint8_t *)CuBin; RealLength = CuBinSize; + +#if 0 fprintf(stderr, "Performed CUDA bin workaround (size = %lu)\n", RealLength); +#endif ur_program_handle_t Program = 
new ur_program_handle_t_(); auto Res = olCreateProgram(hContext->Device->OffloadDevice, RealBinary, @@ -39,7 +43,6 @@ ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t hContext, // Program owns the linked module now cuLinkDestroy(State); - (void)State; if (Res != OL_SUCCESS) { delete Program; @@ -146,13 +149,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } - ur_platform_handle_t DevicePlatform; - urDeviceGetInfo(phDevices[0], UR_DEVICE_INFO_PLATFORM, - sizeof(ur_platform_handle_t), &DevicePlatform, nullptr); - ur_platform_backend_t PlatformBackend; - urPlatformGetInfo(DevicePlatform, UR_PLATFORM_INFO_BACKEND, - sizeof(ur_platform_backend_t), &PlatformBackend, nullptr); - auto *RealBinary = ppBinaries[0]; size_t RealLength = pLengths[0]; @@ -171,7 +167,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( } } - if (PlatformBackend == UR_PLATFORM_BACKEND_CUDA) { + ol_platform_backend_t Backend; + olGetPlatformInfo(phDevices[0]->Platform->OffloadPlatform, + OL_PLATFORM_INFO_BACKEND, sizeof(Backend), &Backend); + if (Backend == OL_PLATFORM_BACKEND_CUDA) { return ProgramCreateCudaWorkaround(hContext, RealBinary, RealLength, phProgram); } diff --git a/unified-runtime/test/conformance/source/environment.cpp b/unified-runtime/test/conformance/source/environment.cpp index d2f63c37929d8..8298f662dcc51 100644 --- a/unified-runtime/test/conformance/source/environment.cpp +++ b/unified-runtime/test/conformance/source/environment.cpp @@ -216,14 +216,23 @@ std::string KernelsEnvironment::getTargetName(ur_platform_handle_t platform) { case UR_PLATFORM_BACKEND_HIP: return "amdgcn-amd-amdhsa"; case UR_PLATFORM_BACKEND_OFFLOAD: { - // TODO: In future this should use urDeviceSelectBinary - auto result = ur_getenv("UR_OFFLOAD_TARGET_NAME"); - if (!result) { - error = "For offload testing, please specify a target in " - "`UR_OFFLOAD_TARGET_NAME`"; + // All Offload platforms report this backend, use 
the platform name to select + // the actual underlying backend. + std::vector PlatformName; + size_t PlatformNameSize = 0; + urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, 0, nullptr, + &PlatformNameSize); + PlatformName.resize(PlatformNameSize); + urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, PlatformNameSize, + PlatformName.data(), nullptr); + if (std::strcmp(PlatformName.data(), "CUDA") == 0) { + return "nvptx64-nvidia-cuda"; + } else if (std::strcmp(PlatformName.data(), "AMDGPU") == 0) { + return "amdgcn-amd-amdhsa"; + } else { + error = "Could not detect target for Offload platform"; return {}; } - return *result; } case UR_PLATFORM_BACKEND_NATIVE_CPU: error = "native_cpu doesn't support kernel tests yet"; From 3794a92cacaa5d4f0192485acd0f0d1e7daa763a Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Wed, 30 Apr 2025 17:36:25 +0100 Subject: [PATCH 09/11] Add missing license text --- .../source/adapters/offload/CMakeLists.txt | 5 +++++ .../source/adapters/offload/adapter.cpp | 2 +- .../source/adapters/offload/adapter.hpp | 10 ++++++++++ .../source/adapters/offload/common.hpp | 10 ++++++++++ .../source/adapters/offload/context.cpp | 10 ++++++++++ .../source/adapters/offload/context.hpp | 10 ++++++++++ .../source/adapters/offload/device.cpp | 10 ++++++++++ .../source/adapters/offload/device.hpp | 10 ++++++++++ .../source/adapters/offload/enqueue.cpp | 14 ++++++++++++-- .../source/adapters/offload/event.cpp | 10 ++++++++++ .../source/adapters/offload/event.hpp | 14 ++++++++++++-- .../source/adapters/offload/kernel.cpp | 17 +++++++++++++---- .../source/adapters/offload/kernel.hpp | 10 ++++++++++ .../source/adapters/offload/platform.cpp | 10 ++++++++++ .../source/adapters/offload/platform.hpp | 15 +++++++++++++-- .../source/adapters/offload/program.cpp | 10 ++++++++++ .../source/adapters/offload/program.hpp | 14 ++++++++++++-- .../source/adapters/offload/queue.cpp | 10 ++++++++++ .../source/adapters/offload/queue.hpp | 16 +++++++++++++--- 
unified-runtime/source/adapters/offload/usm.cpp | 10 ++++++++++ unified-runtime/source/loader/ur_lib.cpp | 2 +- 21 files changed, 202 insertions(+), 17 deletions(-) diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt index bbef36f3ff8d2..dfcafc0fa98e5 100644 --- a/unified-runtime/source/adapters/offload/CMakeLists.txt +++ b/unified-runtime/source/adapters/offload/CMakeLists.txt @@ -1,3 +1,8 @@ +# Copyright (C) 2025 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + set(UR_OFFLOAD_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH "Offload adapter directory") # TODO set(TARGET_NAME ur_adapter_offload) diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp index 9371e6684ca9e..fd3608dcacb0c 100644 --- a/unified-runtime/source/adapters/offload/adapter.cpp +++ b/unified-runtime/source/adapters/offload/adapter.cpp @@ -1,4 +1,4 @@ -//===----------- adapter.cpp - LLVM Offload Plugin -----------------------===// +//===----------- adapter.cpp - LLVM Offload Adapter ----------------------===// // // Copyright (C) 2024 Intel Corporation // diff --git a/unified-runtime/source/adapters/offload/adapter.hpp b/unified-runtime/source/adapters/offload/adapter.hpp index a9fd927b55785..b85995b0f6a08 100644 --- a/unified-runtime/source/adapters/offload/adapter.hpp +++ b/unified-runtime/source/adapters/offload/adapter.hpp @@ -1,3 +1,13 @@ +//===----------- adapter.hpp - LLVM Offload Adapter ----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #pragma once #include diff --git a/unified-runtime/source/adapters/offload/common.hpp b/unified-runtime/source/adapters/offload/common.hpp index 152714bdc2cc5..2159f9ae993a1 100644 --- a/unified-runtime/source/adapters/offload/common.hpp +++ b/unified-runtime/source/adapters/offload/common.hpp @@ -1,3 +1,13 @@ +//===----------- common.hpp - LLVM Offload Adapter -----------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #pragma once #include diff --git a/unified-runtime/source/adapters/offload/context.cpp b/unified-runtime/source/adapters/offload/context.cpp index 01d015038c3b1..5e76ab5abb256 100644 --- a/unified-runtime/source/adapters/offload/context.cpp +++ b/unified-runtime/source/adapters/offload/context.cpp @@ -1,3 +1,13 @@ +//===----------- context.cpp - LLVM Offload Adapter ----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include "context.hpp" #include diff --git a/unified-runtime/source/adapters/offload/context.hpp b/unified-runtime/source/adapters/offload/context.hpp index ce43d428cd0c7..64727ce3338bb 100644 --- a/unified-runtime/source/adapters/offload/context.hpp +++ b/unified-runtime/source/adapters/offload/context.hpp @@ -1,3 +1,13 @@ +//===----------- context.hpp - LLVM Offload Adapter ----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #pragma once #include "common.hpp" diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp index ec5929577b8dc..2dfa7d05ed3fe 100644 --- a/unified-runtime/source/adapters/offload/device.cpp +++ b/unified-runtime/source/adapters/offload/device.cpp @@ -1,3 +1,13 @@ +//===----------- device.cpp - LLVM Offload Adapter -----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include #include #include diff --git a/unified-runtime/source/adapters/offload/device.hpp b/unified-runtime/source/adapters/offload/device.hpp index b1fc24792f3b4..1f616745384e8 100644 --- a/unified-runtime/source/adapters/offload/device.hpp +++ b/unified-runtime/source/adapters/offload/device.hpp @@ -1,3 +1,13 @@ +//===----------- device.hpp - LLVM Offload Adapter -----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #pragma once #include "common.hpp" diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp index 30f5a099429fa..7ec26a3b25ea0 100644 --- a/unified-runtime/source/adapters/offload/enqueue.cpp +++ b/unified-runtime/source/adapters/offload/enqueue.cpp @@ -1,3 +1,13 @@ +//===----------- enqueue.cpp - LLVM Offload Adapter ----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include #include #include @@ -17,8 +27,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch( (void)phEventWaitList; // - (void) pGlobalWorkOffset; - (void) pLocalWorkSize; + (void)pGlobalWorkOffset; + (void)pLocalWorkSize; assert(workDim == 1); diff --git a/unified-runtime/source/adapters/offload/event.cpp b/unified-runtime/source/adapters/offload/event.cpp index 5dec5fa29d113..cd92464110eeb 100644 --- a/unified-runtime/source/adapters/offload/event.cpp +++ b/unified-runtime/source/adapters/offload/event.cpp @@ -1,3 +1,13 @@ +//===----------- event.cpp - LLVM Offload Adapter ------------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include #include diff --git a/unified-runtime/source/adapters/offload/event.hpp b/unified-runtime/source/adapters/offload/event.hpp index 95f692214e6f1..16e0dc649d2ef 100644 --- a/unified-runtime/source/adapters/offload/event.hpp +++ b/unified-runtime/source/adapters/offload/event.hpp @@ -1,10 +1,20 @@ +//===----------- event.hpp - LLVM Offload Adapter ------------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #pragma once -#include #include +#include #include "common.hpp" struct ur_event_handle_t_ : RefCounted { - ol_event_handle_t OffloadEvent; + ol_event_handle_t OffloadEvent; }; diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp index 6ab95aa6640da..9195bec1f72fc 100644 --- a/unified-runtime/source/adapters/offload/kernel.cpp +++ b/unified-runtime/source/adapters/offload/kernel.cpp @@ -1,3 +1,13 @@ +//===----------- kernel.cpp - LLVM Offload Adapter -----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include "kernel.hpp" #include "program.hpp" #include "ur2offload.hpp" @@ -56,10 +66,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue( return UR_RESULT_SUCCESS; } -UR_APIEXPORT ur_result_t UR_APICALL -urKernelGetGroupInfo(ur_kernel_handle_t, ur_device_handle_t, - ur_kernel_group_info_t propName, size_t propSize, - void *pPropValue, size_t *pPropSizeRet) { +UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo( + ur_kernel_handle_t, ur_device_handle_t, ur_kernel_group_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet) { UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); if (propName == UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE) { diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp index dee293aaa1b44..dea7e25d9da9e 100644 --- a/unified-runtime/source/adapters/offload/kernel.hpp +++ b/unified-runtime/source/adapters/offload/kernel.hpp @@ -1,3 
+1,13 @@ +//===----------- kernel.hpp - LLVM Offload Adapter -----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #pragma once #include diff --git a/unified-runtime/source/adapters/offload/platform.cpp b/unified-runtime/source/adapters/offload/platform.cpp index 133a653baa7ca..da18fef81d360 100644 --- a/unified-runtime/source/adapters/offload/platform.cpp +++ b/unified-runtime/source/adapters/offload/platform.cpp @@ -1,3 +1,13 @@ +//===----------- platform.cpp - LLVM Offload Adapter ---------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include #include #include diff --git a/unified-runtime/source/adapters/offload/platform.hpp b/unified-runtime/source/adapters/offload/platform.hpp index 100e103998364..82976e56f0508 100644 --- a/unified-runtime/source/adapters/offload/platform.hpp +++ b/unified-runtime/source/adapters/offload/platform.hpp @@ -1,12 +1,23 @@ +//===----------- platform.hpp - LLVM Offload Adapter ---------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #pragma once #include "common.hpp" -#include #include +#include #include struct ur_platform_handle_t_ : ur::offload::handle_base { - ur_platform_handle_t_(ol_platform_handle_t OffloadPlatform) : handle_base(), OffloadPlatform(OffloadPlatform) {}; + ur_platform_handle_t_(ol_platform_handle_t OffloadPlatform) + : handle_base(), OffloadPlatform(OffloadPlatform) {}; ol_platform_handle_t OffloadPlatform; std::vector Devices; diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp index 8489b3c5b0a7e..c35b563c24822 100644 --- a/unified-runtime/source/adapters/offload/program.cpp +++ b/unified-runtime/source/adapters/offload/program.cpp @@ -1,3 +1,13 @@ +//===----------- program.cpp - LLVM Offload Adapter ----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include #include #include diff --git a/unified-runtime/source/adapters/offload/program.hpp b/unified-runtime/source/adapters/offload/program.hpp index 0639ab336c5fb..1d0263aad2998 100644 --- a/unified-runtime/source/adapters/offload/program.hpp +++ b/unified-runtime/source/adapters/offload/program.hpp @@ -1,10 +1,20 @@ +//===----------- program.hpp - LLVM Offload Adapter ----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #pragma once -#include #include +#include #include "common.hpp" struct ur_program_handle_t_ : RefCounted { - ol_program_handle_t OffloadProgram; + ol_program_handle_t OffloadProgram; }; diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp index 32adb1f512c11..7ddb9b35c0ffa 100644 --- a/unified-runtime/source/adapters/offload/queue.cpp +++ b/unified-runtime/source/adapters/offload/queue.cpp @@ -1,3 +1,13 @@ +//===----------- queue.cpp - LLVM Offload Adapter ------------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include #include #include diff --git a/unified-runtime/source/adapters/offload/queue.hpp b/unified-runtime/source/adapters/offload/queue.hpp index 9406d460b7401..6afe4bf15098e 100644 --- a/unified-runtime/source/adapters/offload/queue.hpp +++ b/unified-runtime/source/adapters/offload/queue.hpp @@ -1,11 +1,21 @@ +//===----------- queue.hpp - LLVM Offload Adapter ------------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #pragma once -#include #include +#include #include "common.hpp" struct ur_queue_handle_t_ : RefCounted { - ol_queue_handle_t OffloadQueue; - ol_device_handle_t OffloadDevice; + ol_queue_handle_t OffloadQueue; + ol_device_handle_t OffloadDevice; }; diff --git a/unified-runtime/source/adapters/offload/usm.cpp b/unified-runtime/source/adapters/offload/usm.cpp index f603516811f0a..497e454885f06 100644 --- a/unified-runtime/source/adapters/offload/usm.cpp +++ b/unified-runtime/source/adapters/offload/usm.cpp @@ -1,3 +1,13 @@ +//===----------- usm.cpp - LLVM Offload Adapter --------------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + #include #include #include diff --git a/unified-runtime/source/loader/ur_lib.cpp b/unified-runtime/source/loader/ur_lib.cpp index c224ca00b1777..ca7c8f1bacc8f 100644 --- a/unified-runtime/source/loader/ur_lib.cpp +++ b/unified-runtime/source/loader/ur_lib.cpp @@ -259,7 +259,7 @@ ur_result_t urDeviceGetSelected(ur_platform_handle_t hPlatform, {UR_PLATFORM_BACKEND_HIP, "hip"}, {UR_PLATFORM_BACKEND_NATIVE_CPU, "native_cpu"}, {UR_PLATFORM_BACKEND_OFFLOAD, "offload"}, - }; + }; if (!hPlatform) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; From 626701db084e7a61ea50e283adb43b7ff0780016 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 1 May 2025 12:19:43 +0100 Subject: [PATCH 10/11] Address review feedback --- unified-runtime/CMakeLists.txt | 1 + unified-runtime/source/adapters/offload/.clang-format | 4 ---- unified-runtime/source/adapters/offload/CMakeLists.txt | 7 ++++--- 3 files changed, 5 insertions(+), 
7 deletions(-) delete mode 100644 unified-runtime/source/adapters/offload/.clang-format diff --git a/unified-runtime/CMakeLists.txt b/unified-runtime/CMakeLists.txt index 7d1ee861b1879..0e051b402c692 100644 --- a/unified-runtime/CMakeLists.txt +++ b/unified-runtime/CMakeLists.txt @@ -47,6 +47,7 @@ option(UR_BUILD_ADAPTER_HIP "Build the HIP adapter" OFF) option(UR_BUILD_ADAPTER_NATIVE_CPU "Build the Native-CPU adapter" OFF) option(UR_BUILD_ADAPTER_ALL "Build all currently supported adapters" OFF) option(UR_BUILD_ADAPTER_L0_V2 "Build the (experimental) Level-Zero v2 adapter" OFF) +option(UR_BUILD_ADAPTER_OFFLOAD "Build the experimental Offload adapter" OFF) option(UR_STATIC_ADAPTER_L0 "Build the Level-Zero adapter as static and embed in the loader" OFF) option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF) option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF) diff --git a/unified-runtime/source/adapters/offload/.clang-format b/unified-runtime/source/adapters/offload/.clang-format deleted file mode 100644 index c8daebc205b34..0000000000000 --- a/unified-runtime/source/adapters/offload/.clang-format +++ /dev/null @@ -1,4 +0,0 @@ ---- -Language: Cpp -BasedOnStyle: LLVM -... 
diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt index dfcafc0fa98e5..c68b9aba9c623 100644 --- a/unified-runtime/source/adapters/offload/CMakeLists.txt +++ b/unified-runtime/source/adapters/offload/CMakeLists.txt @@ -3,8 +3,6 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -set(UR_OFFLOAD_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH "Offload adapter directory") # TODO - set(TARGET_NAME ur_adapter_offload) set(UR_OFFLOAD_INSTALL_DIR "" CACHE PATH "Path to the directory containing libomptarget.so etc") @@ -17,7 +15,10 @@ if (UR_OFFLOAD_INCLUDE_DIR STREQUAL "") message(FATAL_ERROR "UR_OFFLOAD_INCLUDE_DIR must be defined for the Offload adapter") endif() -# For the PTX workaround we need to link with CUDA. +# When targeting CUDA devices, we need a workaround to avoid sending PTX to +# liboffload as the CUDA plugin doesn't support it yet. The workaround is to +# simply always link the incoming program so it ends up as CUBIN. Try to find +# the CUDA driver so we can enable this where possible.
if (NOT TARGET cudadrv) find_package(CUDA 10.1) add_library(cudadrv SHARED IMPORTED GLOBAL) From b42db9e867ba3581dfdfdb196eb4babce8a65469 Mon Sep 17 00:00:00 2001 From: Callum Fare Date: Thu, 1 May 2025 13:51:46 +0100 Subject: [PATCH 11/11] Remove unneeded struct in olIterateDevices call --- .../source/adapters/offload/adapter.cpp | 26 +++++++++---------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp index fd3608dcacb0c..9ee8ec38fa4cd 100644 --- a/unified-runtime/source/adapters/offload/adapter.cpp +++ b/unified-runtime/source/adapters/offload/adapter.cpp @@ -25,14 +25,11 @@ ur_adapter_handle_t_ Adapter{}; ur_result_t ur_adapter_handle_t_::init() { auto Res = olInit(); - struct InitUserData { - std::unordered_map TempMap; - } InitUserData{{}}; - // Discover every platform and device Res = olIterateDevices( [](ol_device_handle_t D, void *UserData) { - auto *Data = reinterpret_cast(UserData); + auto *Platforms = + reinterpret_cast(UserData); ol_platform_handle_t Platform; olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), @@ -43,20 +40,21 @@ ur_result_t ur_adapter_handle_t_::init() { if (Backend == OL_PLATFORM_BACKEND_HOST) { Adapter.HostDevice = D; } else if (Backend != OL_PLATFORM_BACKEND_UNKNOWN) { - ur_platform_handle_t UrPlatform; - if (!Data->TempMap.count(Platform)) { - Adapter.Platforms.push_back(ur_platform_handle_t_{Platform}); - UrPlatform = &Adapter.Platforms.back(); - Data->TempMap.insert({Platform, UrPlatform}); - } else { - UrPlatform = Data->TempMap[Platform]; + auto URPlatform = + std::find_if(Platforms->begin(), Platforms->end(), [&](auto &P) { + return P.OffloadPlatform == Platform; + }); + + if (URPlatform == Platforms->end()) { + URPlatform = + Platforms->insert(URPlatform, ur_platform_handle_t_(Platform)); } - UrPlatform->Devices.push_back(ur_device_handle_t_{UrPlatform, D}); + 
URPlatform->Devices.push_back(ur_device_handle_t_{&*URPlatform, D}); } return false; }, - &InitUserData); + &Adapter.Platforms); (void)Res;