diff --git a/unified-runtime/CMakeLists.txt b/unified-runtime/CMakeLists.txt index 7d1ee861b1879..0e051b402c692 100644 --- a/unified-runtime/CMakeLists.txt +++ b/unified-runtime/CMakeLists.txt @@ -47,6 +47,7 @@ option(UR_BUILD_ADAPTER_HIP "Build the HIP adapter" OFF) option(UR_BUILD_ADAPTER_NATIVE_CPU "Build the Native-CPU adapter" OFF) option(UR_BUILD_ADAPTER_ALL "Build all currently supported adapters" OFF) option(UR_BUILD_ADAPTER_L0_V2 "Build the (experimental) Level-Zero v2 adapter" OFF) +option(UR_BUILD_ADAPTER_OFFLOAD "Build the experimental Offload adapter" OFF) option(UR_STATIC_ADAPTER_L0 "Build the Level-Zero adapter as static and embed in the loader" OFF) option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF) option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF) diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h index dcf05b2b066c7..36fe8ac06bf94 100644 --- a/unified-runtime/include/ur_api.h +++ b/unified-runtime/include/ur_api.h @@ -1422,6 +1422,8 @@ typedef enum ur_adapter_backend_t { UR_ADAPTER_BACKEND_HIP = 4, /// The backend is Native CPU UR_ADAPTER_BACKEND_NATIVE_CPU = 5, + /// The backend is liboffload + UR_ADAPTER_BACKEND_OFFLOAD = 0x100, /// @cond UR_ADAPTER_BACKEND_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -1801,6 +1803,8 @@ typedef enum ur_platform_backend_t { UR_PLATFORM_BACKEND_HIP = 4, /// The backend is Native CPU UR_PLATFORM_BACKEND_NATIVE_CPU = 5, + /// The backend is liboffload + UR_PLATFORM_BACKEND_OFFLOAD = 0x100, /// @cond UR_PLATFORM_BACKEND_FORCE_UINT32 = 0x7fffffff /// @endcond diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp index c5333f76f478e..63656894ae0e9 100644 --- a/unified-runtime/include/ur_print.hpp +++ b/unified-runtime/include/ur_print.hpp @@ -2356,6 +2356,9 @@ inline std::ostream &operator<<(std::ostream &os, case UR_ADAPTER_BACKEND_NATIVE_CPU: os << "UR_ADAPTER_BACKEND_NATIVE_CPU"; break; + case 
UR_ADAPTER_BACKEND_OFFLOAD: + os << "UR_ADAPTER_BACKEND_OFFLOAD"; + break; default: os << "unknown enumerator"; break; @@ -2553,6 +2556,9 @@ inline std::ostream &operator<<(std::ostream &os, case UR_PLATFORM_BACKEND_NATIVE_CPU: os << "UR_PLATFORM_BACKEND_NATIVE_CPU"; break; + case UR_PLATFORM_BACKEND_OFFLOAD: + os << "UR_PLATFORM_BACKEND_OFFLOAD"; + break; default: os << "unknown enumerator"; break; diff --git a/unified-runtime/scripts/core/adapter.yml b/unified-runtime/scripts/core/adapter.yml index d806d48974a4f..8253104386b05 100644 --- a/unified-runtime/scripts/core/adapter.yml +++ b/unified-runtime/scripts/core/adapter.yml @@ -209,6 +209,9 @@ etors: - name: NATIVE_CPU value: "5" desc: "The backend is Native CPU" + - name: OFFLOAD + value: "0x100" + desc: "The backend is liboffload" --- #-------------------------------------------------------------------------- type: enum desc: "Minimum level of messages to be processed by the logger." diff --git a/unified-runtime/scripts/core/manifests.yml b/unified-runtime/scripts/core/manifests.yml index 6b9647852daea..da58ebb57df34 100644 --- a/unified-runtime/scripts/core/manifests.yml +++ b/unified-runtime/scripts/core/manifests.yml @@ -61,3 +61,10 @@ name: native_cpu backend: $X_ADAPTER_BACKEND_NATIVE_CPU device_types: - $X_DEVICE_TYPE_CPU +--- #-------------------------------------------------------------------------- +type: manifest +name: offload +backend: $X_ADAPTER_BACKEND_OFFLOAD +device_types: + - $X_DEVICE_TYPE_CPU + - $X_DEVICE_TYPE_GPU diff --git a/unified-runtime/scripts/core/platform.yml b/unified-runtime/scripts/core/platform.yml index 7d4edf5c0b5c0..84c7a99d6e833 100644 --- a/unified-runtime/scripts/core/platform.yml +++ b/unified-runtime/scripts/core/platform.yml @@ -279,3 +279,6 @@ etors: - name: NATIVE_CPU value: "5" desc: "The backend is Native CPU" + - name: OFFLOAD + value: "0x100" + desc: "The backend is liboffload" diff --git a/unified-runtime/source/adapters/CMakeLists.txt 
b/unified-runtime/source/adapters/CMakeLists.txt
index 56e053d29bc3e..8c357caa21946 100644
--- a/unified-runtime/source/adapters/CMakeLists.txt
+++ b/unified-runtime/source/adapters/CMakeLists.txt
@@ -68,9 +68,15 @@ if(UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL)
   add_ur_adapter_subdirectory(opencl)
   list(APPEND TEMP_LIST "opencl")
 endif()
+
 if(UR_BUILD_ADAPTER_NATIVE_CPU OR UR_BUILD_ADAPTER_ALL)
   add_ur_adapter_subdirectory(native_cpu)
   list(APPEND TEMP_LIST "native_cpu")
 endif()
+# NOTE: unlike the adapters above, the Offload adapter is not enabled by
+# UR_BUILD_ADAPTER_ALL; it has to be requested with UR_BUILD_ADAPTER_OFFLOAD.
+if(UR_BUILD_ADAPTER_OFFLOAD)
+  add_ur_adapter_subdirectory(offload)
+  list(APPEND TEMP_LIST "offload")
+endif()
+
 set(UR_ADAPTERS_LIST "${TEMP_LIST}" CACHE STRING "" FORCE)
diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt
new file mode 100644
index 0000000000000..c68b9aba9c623
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/CMakeLists.txt
@@ -0,0 +1,72 @@
+# Copyright (C) 2025 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+set(TARGET_NAME ur_adapter_offload)
+
+# Both paths are mandatory: configuration stops with a fatal error when either
+# cache variable is left empty.
+set(UR_OFFLOAD_INSTALL_DIR "" CACHE PATH "Path to the directory containing libomptarget.so etc")
+if (UR_OFFLOAD_INSTALL_DIR STREQUAL "")
+  message(FATAL_ERROR "UR_OFFLOAD_INSTALL_DIR must be defined for the Offload adapter")
+endif()
+
+set(UR_OFFLOAD_INCLUDE_DIR "" CACHE PATH "Path to the directory containing LLVM headers")
+if (UR_OFFLOAD_INCLUDE_DIR STREQUAL "")
+  message(FATAL_ERROR "UR_OFFLOAD_INCLUDE_DIR must be defined for the Offload adapter")
+endif()
+
+# When targeting CUDA devices, we need a workaround to avoid sending PTX to
+# liboffload as the CUDA plugin doesn't support it yet. The workaround is to
+# simply always link the incoming program so it ends up as CUBIN. Try to find
+# the cuda driver so we can enable this where possible.
+if (NOT TARGET cudadrv)
+  find_package(CUDA 10.1)
+  # CUDA is optional here, so only create the imported target when the driver
+  # library was actually located. The property values are quoted so that an
+  # empty variable cannot change the number of arguments passed to
+  # set_target_properties().
+  if (CUDA_cuda_driver_LIBRARY)
+    add_library(cudadrv SHARED IMPORTED GLOBAL)
+    set_target_properties(
+      cudadrv PROPERTIES
+        IMPORTED_LOCATION "${CUDA_cuda_driver_LIBRARY}"
+        INTERFACE_INCLUDE_DIRECTORIES "${CUDAToolkit_INCLUDE_DIRS}"
+    )
+  endif()
+endif()
+
+add_ur_adapter(${TARGET_NAME}
+  SHARED
+  ${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/context.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ur2offload.hpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
+  ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp
+)
+
+set_target_properties(${TARGET_NAME} PROPERTIES
+  VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}"
+  SOVERSION "${PROJECT_VERSION_MAJOR}"
+)
+
+# The CUDA workaround is compiled in (UR_CUDA_ENABLED) and the driver linked
+# only when the driver library is available.
+set(ADDITIONAL_LINK_LIBS "")
+if (CUDA_cuda_driver_LIBRARY)
+  list(APPEND ADDITIONAL_LINK_LIBS
+    cudadrv
+  )
+  target_compile_definitions(${TARGET_NAME} PRIVATE UR_CUDA_ENABLED)
+endif()
+
+target_link_libraries(${TARGET_NAME} PRIVATE
+  ${PROJECT_NAME}::headers
+  ${PROJECT_NAME}::common
+  ${PROJECT_NAME}::umf
+  ${UR_OFFLOAD_INSTALL_DIR}/lib/libLLVMOffload.so
+  ${ADDITIONAL_LINK_LIBS}
+)
+
+target_include_directories(${TARGET_NAME} PRIVATE
+  "${UR_OFFLOAD_INCLUDE_DIR}/offload"
+  "${CMAKE_CURRENT_SOURCE_DIR}/../../"
+)
diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp
new file mode 100644
index 0000000000000..9ee8ec38fa4cd
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/adapter.cpp
@@ -0,0 +1,108 @@
+//===----------- adapter.cpp - LLVM Offload Adapter ----------------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with
LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "adapter.hpp" +#include "device.hpp" +#include "platform.hpp" +#include "ur/ur.hpp" +#include "ur_api.h" + +ur_adapter_handle_t_ Adapter{}; + +// Initialize liboffload and perform the initial platform and device discovery +ur_result_t ur_adapter_handle_t_::init() { + auto Res = olInit(); + + // Discover every platform and device + Res = olIterateDevices( + [](ol_device_handle_t D, void *UserData) { + auto *Platforms = + reinterpret_cast(UserData); + + ol_platform_handle_t Platform; + olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform), + &Platform); + ol_platform_backend_t Backend; + olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend), + &Backend); + if (Backend == OL_PLATFORM_BACKEND_HOST) { + Adapter.HostDevice = D; + } else if (Backend != OL_PLATFORM_BACKEND_UNKNOWN) { + auto URPlatform = + std::find_if(Platforms->begin(), Platforms->end(), [&](auto &P) { + return P.OffloadPlatform == Platform; + }); + + if (URPlatform == Platforms->end()) { + URPlatform = + Platforms->insert(URPlatform, ur_platform_handle_t_(Platform)); + } + + URPlatform->Devices.push_back(ur_device_handle_t_{&*URPlatform, D}); + } + return false; + }, + &Adapter.Platforms); + + (void)Res; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet( + uint32_t, ur_adapter_handle_t *phAdapters, uint32_t *pNumAdapters) { + if (phAdapters) { + if (++Adapter.RefCount == 1) { + Adapter.init(); + } + *phAdapters = &Adapter; + } + if (pNumAdapters) { + *pNumAdapters = 1; + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urAdapterRelease(ur_adapter_handle_t) { + if (--Adapter.RefCount == 0) { + // This can crash when tracing is enabled. 
+ // olShutDown(); + }; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urAdapterRetain(ur_adapter_handle_t) { + Adapter.RefCount++; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t, + ur_adapter_info_t propName, + size_t propSize, + void *pPropValue, + size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + switch (propName) { + case UR_ADAPTER_INFO_BACKEND: + return ReturnValue(UR_ADAPTER_BACKEND_OFFLOAD); + case UR_ADAPTER_INFO_REFERENCE_COUNT: + return ReturnValue(Adapter.RefCount.load()); + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/offload/adapter.hpp b/unified-runtime/source/adapters/offload/adapter.hpp new file mode 100644 index 0000000000000..b85995b0f6a08 --- /dev/null +++ b/unified-runtime/source/adapters/offload/adapter.hpp @@ -0,0 +1,32 @@ +//===----------- adapter.hpp - LLVM Offload Adapter ----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include
+#include
+#include
+
+#include
+
+#include "common.hpp"
+#include "logger/ur_logger.hpp"
+#include "platform.hpp"
+
+// State of the Offload adapter. A single global instance (`Adapter`, declared
+// below) is shared by all entry points.
+struct ur_adapter_handle_t_ : ur::offload::handle_base {
+  // Number of outstanding adapter references; starts at zero.
+  std::atomic_uint32_t RefCount = 0;
+  // Logger obtained under the name "offload".
+  logger::Logger &Logger = logger::get_logger("offload");
+  // Device handle of the host backend; nullptr until set.
+  ol_device_handle_t HostDevice = nullptr;
+  // Platforms known to the adapter.
+  std::vector Platforms;
+
+  // Initializes liboffload and discovers platforms and devices.
+  ur_result_t init();
+};
+
+extern ur_adapter_handle_t_ Adapter;
diff --git a/unified-runtime/source/adapters/offload/common.hpp b/unified-runtime/source/adapters/offload/common.hpp
new file mode 100644
index 0000000000000..2159f9ae993a1
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/common.hpp
@@ -0,0 +1,21 @@
+//===----------- common.hpp - LLVM Offload Adapter -----------------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include
+
+namespace ur::offload {
+// Empty common base of every handle type in this adapter.
+struct handle_base {};
+} // namespace ur::offload
+
+// Base for handles with a reference count. A new object starts with one
+// reference.
+struct RefCounted : ur::offload::handle_base {
+  std::atomic_uint32_t RefCount = 1;
+};
diff --git a/unified-runtime/source/adapters/offload/context.cpp b/unified-runtime/source/adapters/offload/context.cpp
new file mode 100644
index 0000000000000..5e76ab5abb256
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/context.cpp
@@ -0,0 +1,38 @@
+//===----------- context.cpp - LLVM Offload Adapter ----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions.
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "context.hpp" +#include + +UR_APIEXPORT ur_result_t UR_APICALL urContextCreate( + uint32_t DeviceCount, const ur_device_handle_t *phDevices, + const ur_context_properties_t *, ur_context_handle_t *phContext) { + if (DeviceCount > 1) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + auto Ctx = new ur_context_handle_t_(*phDevices); + *phContext = Ctx; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urContextRetain(ur_context_handle_t hContext) { + hContext->RefCount++; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urContextRelease(ur_context_handle_t hContext) { + if (--hContext->RefCount == 0) { + delete hContext; + } + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/offload/context.hpp b/unified-runtime/source/adapters/offload/context.hpp new file mode 100644 index 0000000000000..64727ce3338bb --- /dev/null +++ b/unified-runtime/source/adapters/offload/context.hpp @@ -0,0 +1,26 @@ +//===----------- context.hpp - LLVM Offload Adapter ----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "common.hpp"
+#include
+#include
+#include
+
+// A context over a single device. The device is retained for the lifetime of
+// the context and released again in the destructor.
+struct ur_context_handle_t_ : RefCounted {
+  ur_context_handle_t_(ur_device_handle_t hDevice) : Device{hDevice} {
+    urDeviceRetain(Device);
+  }
+  ~ur_context_handle_t_() { urDeviceRelease(Device); }
+
+  // The one device this context was created for.
+  ur_device_handle_t Device;
+  // NOTE(review): presumably records the allocation type of USM allocations
+  // made through this context; no user is visible in this file - confirm.
+  std::unordered_map AllocTypeMap;
+};
diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp
new file mode 100644
index 0000000000000..2dfa7d05ed3fe
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/device.cpp
@@ -0,0 +1,153 @@
+//===----------- device.cpp - LLVM Offload Adapter -----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+
+#include "device.hpp"
+#include "platform.hpp"
+#include "ur2offload.hpp"
+
+// Lists the devices of a platform. The device-type filter (second parameter)
+// is ignored, so every device of the platform is reported. At most NumEntries
+// handles are written to phDevices, and the total device count is stored in
+// *pNumDevices when that pointer is given.
+// NOTE(review): phDevices is written without a null check whenever NumEntries
+// is non-zero - confirm callers never pass null together with a count.
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform,
+                                                ur_device_type_t,
+                                                uint32_t NumEntries,
+                                                ur_device_handle_t *phDevices,
+                                                uint32_t *pNumDevices) {
+  if (pNumDevices) {
+    *pNumDevices = static_cast(hPlatform->Devices.size());
+  }
+
+  // Never write more handles than the caller has room for.
+  size_t NumDevices =
+      std::min(static_cast(hPlatform->Devices.size()), NumEntries);
+
+  for (size_t I = 0; I < NumDevices; I++) {
+    phDevices[I] = &hPlatform->Devices[I];
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+// Answers device queries. Properties are either answered directly from fixed
+// values or forwarded to liboffload by translating propName into an
+// ol_device_info_t. Unhandled properties yield
+// UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION.
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
+                                                    ur_device_info_t propName,
+                                                    size_t propSize,
+                                                    void *pPropValue,
+                                                    size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+
+  // Cases that `break` are forwarded to liboffload below; cases that `return`
+  // are answered here.
+  ol_device_info_t olInfo;
+  switch (propName) {
+  case UR_DEVICE_INFO_NAME:
+    olInfo = OL_DEVICE_INFO_NAME;
+    break;
+  case UR_DEVICE_INFO_PARENT_DEVICE:
+    return ReturnValue(nullptr);
+  case UR_DEVICE_INFO_VERSION:
+    return ReturnValue("");
+  case UR_DEVICE_INFO_EXTENSIONS:
+    return ReturnValue("");
+  case UR_DEVICE_INFO_USE_NATIVE_ASSERT:
+    return ReturnValue(false);
+  case UR_DEVICE_INFO_TYPE:
+    olInfo = OL_DEVICE_INFO_TYPE;
+    break;
+  case UR_DEVICE_INFO_VENDOR:
+    olInfo = OL_DEVICE_INFO_VENDOR;
+    break;
+  case UR_DEVICE_INFO_DRIVER_VERSION:
+    olInfo = OL_DEVICE_INFO_DRIVER_VERSION;
+    break;
+  case UR_DEVICE_INFO_PLATFORM:
+    return ReturnValue(hDevice->Platform);
+    break;
+  case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT:
+    return ReturnValue(UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS);
+  case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE:
+    return ReturnValue(false);
+  default:
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  }
+
+  // Size query: liboffload writes the size straight into the caller's output.
+  if (pPropSizeRet) {
+    if (auto Res =
+            olGetDeviceInfoSize(hDevice->OffloadDevice, olInfo, pPropSizeRet)) {
+      return offloadResultToUR(Res);
+    }
+  }
+
+  // Value query: liboffload writes the value straight into the caller's
+  // buffer.
+  if (pPropValue) {
+    if (auto Res = olGetDeviceInfo(hDevice->OffloadDevice, olInfo, propSize,
+                                   pPropValue)) {
+      return offloadResultToUR(Res);
+    }
+    // Need to explicitly map this type
+    // The liboffload value is converted in place: both pointers alias the
+    // caller's buffer. A value other than CPU or GPU is left untranslated.
+    // NOTE(review): assumes the liboffload and UR device-type enums have the
+    // same size - confirm.
+    if (olInfo == OL_DEVICE_INFO_TYPE) {
+      auto urPropPtr = reinterpret_cast(pPropValue);
+      auto olPropPtr = reinterpret_cast(pPropValue);
+
+      switch (*olPropPtr) {
+      case OL_DEVICE_TYPE_CPU:
+        *urPropPtr = UR_DEVICE_TYPE_CPU;
+        break;
+      case OL_DEVICE_TYPE_GPU:
+        *urPropPtr = UR_DEVICE_TYPE_GPU;
+        break;
+      default:
+        break;
+      }
+    }
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+// Device partitioning is not supported in Offload, and won't be for some time.
+// This means urDeviceRetain/Release are no-ops because all devices are root
+// devices.
+ +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t) { + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t) { + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urDevicePartition(ur_device_handle_t, const ur_device_partition_properties_t *, + uint32_t, ur_device_handle_t *, uint32_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} + +UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary( + ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries, + uint32_t NumBinaries, uint32_t *pSelectedBinary) { + + ol_platform_backend_t Backend; + olGetPlatformInfo(hDevice->Platform->OffloadPlatform, + OL_PLATFORM_INFO_BACKEND, sizeof(Backend), &Backend); + + const char *ImageTarget = UR_DEVICE_BINARY_TARGET_UNKNOWN; + if (Backend == OL_PLATFORM_BACKEND_CUDA) { + ImageTarget = UR_DEVICE_BINARY_TARGET_NVPTX64; + } else if (Backend == OL_PLATFORM_BACKEND_AMDGPU) { + ImageTarget = UR_DEVICE_BINARY_TARGET_AMDGCN; + } + + for (uint32_t i = 0; i < NumBinaries; ++i) { + if (strcmp(pBinaries[i].pDeviceTargetSpec, ImageTarget) == 0) { + *pSelectedBinary = i; + return UR_RESULT_SUCCESS; + } + } + + // No image can be loaded for the given device + return UR_RESULT_ERROR_INVALID_BINARY; +} diff --git a/unified-runtime/source/adapters/offload/device.hpp b/unified-runtime/source/adapters/offload/device.hpp new file mode 100644 index 0000000000000..1f616745384e8 --- /dev/null +++ b/unified-runtime/source/adapters/offload/device.hpp @@ -0,0 +1,24 @@ +//===----------- device.hpp - LLVM Offload Adapter -----------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include "common.hpp"
+#include
+#include
+
+// UR device handle: pairs a liboffload device with the UR platform it belongs
+// to.
+struct ur_device_handle_t_ : ur::offload::handle_base {
+  ur_device_handle_t_(ur_platform_handle_t Platform,
+                      ol_device_handle_t OffloadDevice)
+      : handle_base(), Platform(Platform), OffloadDevice(OffloadDevice) {}
+
+  // UR platform this device belongs to.
+  ur_platform_handle_t Platform;
+  // Underlying liboffload device.
+  ol_device_handle_t OffloadDevice;
+};
diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp
new file mode 100644
index 0000000000000..7ec26a3b25ea0
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/enqueue.cpp
@@ -0,0 +1,61 @@
+//===----------- enqueue.cpp - LLVM Offload Adapter ----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions.
See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include
+#include
+#include
+
+#include "event.hpp"
+#include "kernel.hpp"
+#include "queue.hpp"
+#include "ur2offload.hpp"
+
+// Launches a one-dimensional kernel on the queue's device.
+//
+// The wait list, the global work offset and the local work size are all
+// ignored. pGlobalWorkSize[0] is used as the number of groups in X, with a
+// group size of 1 in every dimension and no dynamic shared memory.
+//
+// Returns the translated liboffload error if the launch fails. When phEvent
+// is given, it receives a new event wrapping the liboffload event of the
+// launch.
+// NOTE(review): when phEvent is null the liboffload event is dropped without
+// being wrapped or destroyed - confirm whether that leaks.
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  // Ignore wait list for now
+  (void)numEventsInWaitList;
+  (void)phEventWaitList;
+  //
+
+  (void)pGlobalWorkOffset;
+  (void)pLocalWorkSize;
+
+  // Only checked in builds where assert is active.
+  assert(workDim == 1);
+
+  ol_kernel_launch_size_args_t LaunchArgs;
+  LaunchArgs.Dimensions = workDim;
+  LaunchArgs.NumGroupsX = pGlobalWorkSize[0];
+  LaunchArgs.NumGroupsY = 1;
+  LaunchArgs.NumGroupsZ = 1;
+  LaunchArgs.GroupSizeX = 1;
+  LaunchArgs.GroupSizeY = 1;
+  LaunchArgs.GroupSizeZ = 1;
+  LaunchArgs.DynSharedMemory = 0;
+
+  // The kernel arguments are handed over as one contiguous buffer.
+  ol_event_handle_t EventOut;
+  auto Ret =
+      olLaunchKernel(hQueue->OffloadQueue, hQueue->OffloadDevice,
+                     hKernel->OffloadKernel, hKernel->Args.getStorage(),
+                     hKernel->Args.getStorageSize(), &LaunchArgs, &EventOut);
+
+  if (Ret != OL_SUCCESS) {
+    return offloadResultToUR(Ret);
+  }
+
+  if (phEvent) {
+    auto *Event = new ur_event_handle_t_();
+    Event->OffloadEvent = EventOut;
+    *phEvent = Event;
+  }
+  return UR_RESULT_SUCCESS;
+}
diff --git a/unified-runtime/source/adapters/offload/event.cpp b/unified-runtime/source/adapters/offload/event.cpp
new file mode 100644
index 0000000000000..cd92464110eeb
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/event.cpp
@@ -0,0 +1,46 @@
+//===----------- event.cpp - LLVM Offload Adapter ------------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0
with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +#include "event.hpp" +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) { + for (uint32_t i = 0; i < numEvents; i++) { + auto Res = olWaitEvent(phEventWaitList[i]->OffloadEvent); + if (Res) { + return offloadResultToUR(Res); + } + } + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) { + hEvent->RefCount++; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) { + if (--hEvent->RefCount == 0) { + // There's a small bug in olDestroyEvent that will crash. Leak the event + // in the meantime. + // auto Res = olDestroyEvent(hEvent->OffloadEvent); + // if (Res) { + // return offloadResultToUR(Res); + // } + } + + delete hEvent; + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/offload/event.hpp b/unified-runtime/source/adapters/offload/event.hpp new file mode 100644 index 0000000000000..16e0dc649d2ef --- /dev/null +++ b/unified-runtime/source/adapters/offload/event.hpp @@ -0,0 +1,20 @@ +//===----------- event.hpp - LLVM Offload Adapter ------------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include
+#include
+
+#include "common.hpp"
+
+// UR event handle wrapping a liboffload event.
+struct ur_event_handle_t_ : RefCounted {
+  // Underlying liboffload event; has no default value and must be assigned by
+  // whoever creates the handle.
+  ol_event_handle_t OffloadEvent;
+};
diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp
new file mode 100644
index 0000000000000..9195bec1f72fc
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/kernel.cpp
@@ -0,0 +1,79 @@
+//===----------- kernel.cpp - LLVM Offload Adapter -----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "kernel.hpp"
+#include "program.hpp"
+#include "ur2offload.hpp"
+#include
+#include
+#include
+
+// Looks up the kernel named pKernelName in the program and wraps it in a new
+// UR kernel handle. On failure the handle is freed again and the translated
+// liboffload error is returned.
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
+               ur_kernel_handle_t *phKernel) {
+  ur_kernel_handle_t Kernel = new ur_kernel_handle_t_;
+
+  auto Res = olGetKernel(hProgram->OffloadProgram, pKernelName,
+                         &Kernel->OffloadKernel);
+
+  if (Res != OL_SUCCESS) {
+    delete Kernel;
+    return offloadResultToUR(Res);
+  }
+
+  *phKernel = Kernel;
+
+  return UR_RESULT_SUCCESS;
+}
+
+// Adds one reference to the kernel.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) {
+  hKernel->RefCount++;
+  return UR_RESULT_SUCCESS;
+}
+
+// Drops one reference and destroys the kernel handle when it was the last
+// one.
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelRelease(ur_kernel_handle_t hKernel) {
+  if (--hKernel->RefCount == 0) {
+    delete hKernel;
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+// Accepts and ignores every execution-info setting.
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelSetExecInfo(ur_kernel_handle_t, ur_kernel_exec_info_t, size_t,
+                    const ur_kernel_exec_info_properties_t *, const void *) {
+  return UR_RESULT_SUCCESS;
+}
+
+// Stores a pointer argument. The pointer value itself (not the memory it
+// points to) is copied into the kernel's argument storage.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer(
+    ur_kernel_handle_t hKernel, uint32_t argIndex,
+    const ur_kernel_arg_pointer_properties_t *, const void *pArgValue) {
+  hKernel->Args.addArg(argIndex, sizeof(pArgValue), &pArgValue);
+  return UR_RESULT_SUCCESS;
+}
+
+// Stores a by-value argument by copying argSize bytes from pArgValue into the
+// kernel's argument storage.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
+    ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize,
+    const ur_kernel_arg_value_properties_t *, const void *pArgValue) {
+  hKernel->Args.addArg(argIndex, argSize, pArgValue);
+  return UR_RESULT_SUCCESS;
+}
+
+// Answers kernel group queries. Only the compile-time work-group size is
+// supported, and it is always reported as {0, 0, 0}; every other property
+// yields UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION.
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo(
+    ur_kernel_handle_t, ur_device_handle_t, ur_kernel_group_info_t propName,
+    size_t propSize, void *pPropValue, size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+
+  if (propName == UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE) {
+    size_t GroupSize[3] = {0, 0, 0};
+    return ReturnValue(GroupSize, 3);
+  }
+  return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+}
diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp
new file mode 100644
index 0000000000000..dea7e25d9da9e
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/kernel.hpp
@@ -0,0 +1,62 @@
+//===----------- kernel.hpp - LLVM Offload Adapter -----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions.
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include +#include +#include +#include +#include + +#include "common.hpp" + +struct ur_kernel_handle_t_ : RefCounted { + + // Simplified version of the CUDA adapter's argument implementation + struct OffloadKernelArguments { + static constexpr size_t MaxParamBytes = 4096u; + using args_t = std::array; + using args_size_t = std::vector; + using args_ptr_t = std::vector; + args_t Storage; + size_t StorageUsed = 0; + args_size_t ParamSizes; + args_ptr_t Pointers; + + // Add an argument. If it already exists, it is replaced. Gaps are filled + // with empty arguments. + void addArg(size_t Index, size_t Size, const void *Arg) { + if (Index + 1 > Pointers.size()) { + Pointers.resize(Index + 1); + ParamSizes.resize(Index + 1); + } + ParamSizes[Index] = Size; + // Calculate the insertion point in the array. + size_t InsertPos = std::accumulate(std::begin(ParamSizes), + std::begin(ParamSizes) + Index, 0); + // Update the stored value for the argument. 
+ std::memcpy(&Storage[InsertPos], Arg, Size); + Pointers[Index] = &Storage[InsertPos]; + } + + const args_ptr_t &getPointers() const noexcept { return Pointers; } + + const char *getStorage() const noexcept { return Storage.data(); } + + size_t getStorageSize() const noexcept { + return std::accumulate(std::begin(ParamSizes), std::end(ParamSizes), 0); + } + }; + + ol_kernel_handle_t OffloadKernel; + OffloadKernelArguments Args{}; +}; diff --git a/unified-runtime/source/adapters/offload/platform.cpp b/unified-runtime/source/adapters/offload/platform.cpp new file mode 100644 index 0000000000000..da18fef81d360 --- /dev/null +++ b/unified-runtime/source/adapters/offload/platform.cpp @@ -0,0 +1,98 @@ +//===----------- platform.cpp - LLVM Offload Adapter ---------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include +#include + +#include "adapter.hpp" +#include "device.hpp" +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL +urPlatformGet(ur_adapter_handle_t, uint32_t NumEntries, + ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) { + + if (pNumPlatforms) { + *pNumPlatforms = Adapter.Platforms.size(); + } + + if (phPlatforms) { + size_t PlatformIndex = 0; + for (auto &Platform : Adapter.Platforms) { + phPlatforms[PlatformIndex++] = &Platform; + if (PlatformIndex == NumEntries) { + break; + } + } + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName, + size_t propSize, void *pPropValue, size_t *pPropSizeRet) { + UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet); + + ol_platform_info_t olInfo; + switch (propName) { + case 
UR_PLATFORM_INFO_NAME: + olInfo = OL_PLATFORM_INFO_NAME; + break; + case UR_PLATFORM_INFO_VENDOR_NAME: + olInfo = OL_PLATFORM_INFO_VENDOR_NAME; + break; + case UR_PLATFORM_INFO_VERSION: + olInfo = OL_PLATFORM_INFO_VERSION; + break; + case UR_PLATFORM_INFO_EXTENSIONS: + return ReturnValue(""); + case UR_PLATFORM_INFO_PROFILE: + return ReturnValue("FULL_PROFILE"); + case UR_PLATFORM_INFO_BACKEND: + return ReturnValue(UR_PLATFORM_BACKEND_OFFLOAD); + break; + default: + return UR_RESULT_ERROR_INVALID_ENUMERATION; + } + + if (pPropSizeRet) { + if (auto Res = olGetPlatformInfoSize(hPlatform->OffloadPlatform, olInfo, + pPropSizeRet)) { + return offloadResultToUR(Res); + } + } + + if (pPropValue) { + if (auto Res = olGetPlatformInfo(hPlatform->OffloadPlatform, olInfo, + propSize, pPropValue)) { + return offloadResultToUR(Res); + } + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urPlatformGetBackendOption(ur_platform_handle_t, const char *pFrontendOption, + const char **ppPlatformOption) { + using namespace std::literals; + if (pFrontendOption == nullptr) + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + if (pFrontendOption == "-O0"sv || pFrontendOption == "-O1"sv || + pFrontendOption == "-O2"sv || pFrontendOption == "-O3"sv || + pFrontendOption == ""sv) { + *ppPlatformOption = ""; + return UR_RESULT_SUCCESS; + } + return UR_RESULT_ERROR_INVALID_VALUE; +} diff --git a/unified-runtime/source/adapters/offload/platform.hpp b/unified-runtime/source/adapters/offload/platform.hpp new file mode 100644 index 0000000000000..82976e56f0508 --- /dev/null +++ b/unified-runtime/source/adapters/offload/platform.hpp @@ -0,0 +1,24 @@ +//===----------- platform.hpp - LLVM Offload Adapter ---------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
// Adapter-side object behind a ur_platform_handle_t. It wraps a single
// liboffload platform handle.
struct ur_platform_handle_t_ : ur::offload::handle_base {
  // Stores the given liboffload platform handle; Devices starts out empty.
  ur_platform_handle_t_(ol_platform_handle_t OffloadPlatform)
      : handle_base(), OffloadPlatform(OffloadPlatform) {};

  // liboffload handle that platform queries are forwarded to.
  ol_platform_handle_t OffloadPlatform;
  // NOTE(review): presumably the devices that belong to this platform; it is
  // filled in outside this header - confirm against the adapter setup code.
  std::vector Devices;
};
+#ifdef UR_CUDA_ENABLED +ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t hContext, + const uint8_t *Binary, size_t Length, + ur_program_handle_t *phProgram) { + uint8_t *RealBinary; + size_t RealLength; + CUlinkState State; + cuLinkCreate(0, nullptr, nullptr, &State); + + cuLinkAddData(State, CU_JIT_INPUT_PTX, (char *)(Binary), Length, nullptr, 0, + nullptr, nullptr); + + void *CuBin = nullptr; + size_t CuBinSize = 0; + cuLinkComplete(State, &CuBin, &CuBinSize); + RealBinary = (uint8_t *)CuBin; + RealLength = CuBinSize; + +#if 0 + fprintf(stderr, "Performed CUDA bin workaround (size = %lu)\n", RealLength); +#endif + + ur_program_handle_t Program = new ur_program_handle_t_(); + auto Res = olCreateProgram(hContext->Device->OffloadDevice, RealBinary, + RealLength, &Program->OffloadProgram); + + // Program owns the linked module now + cuLinkDestroy(State); + + if (Res != OL_SUCCESS) { + delete Program; + return offloadResultToUR(Res); + } + + *phProgram = Program; + + return UR_RESULT_SUCCESS; +} +#else +ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t, const uint8_t *, + size_t, ur_program_handle_t *) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; +} +#endif + +// https://clang.llvm.org/docs/ClangOffloadBundler.html#bundled-binary-file-layout +class HipOffloadBundleParser { + static constexpr std::string_view Magic = "__CLANG_OFFLOAD_BUNDLE__"; + const uint8_t *Buff; + size_t Length; + + struct __attribute__((packed)) BundleEntry { + uint64_t ObjectOffset; + uint64_t ObjectSize; + uint64_t EntryIdSize; + char EntryIdStart; + }; + + struct __attribute__((packed)) BundleHeader { + const char HeaderMagic[Magic.size()]; + uint64_t EntryCount; + BundleEntry FirstEntry; + }; + + HipOffloadBundleParser() = delete; + HipOffloadBundleParser(const uint8_t *Buff, size_t Length) + : Buff(Buff), Length(Length) {} + +public: + static std::optional load(const uint8_t *Buff, + size_t Length) { + if (std::string_view{reinterpret_cast(Buff), Length}.find( + 
Magic) != 0) { + return std::nullopt; + } + return HipOffloadBundleParser(Buff, Length); + } + + ur_result_t extract(std::string_view SearchTargetId, + const uint8_t *&OutBinary, size_t &OutLength) { + const char *Limit = reinterpret_cast(&Buff[Length]); + + // The different check here means that a binary consisting of only the magic + // bytes (but nothing else) will result in INVALID_PROGRAM rather than being + // treated as a non-bundle + auto *Header = reinterpret_cast(Buff); + if (reinterpret_cast(&Header->FirstEntry) > Limit) { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + + const auto *CurrentEntry = &Header->FirstEntry; + for (uint64_t I = 0; I < Header->EntryCount; I++) { + if (&CurrentEntry->EntryIdStart > Limit) { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + auto EntryId = std::string_view(&CurrentEntry->EntryIdStart, + CurrentEntry->EntryIdSize); + if (EntryId.end() > Limit) { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + + // Will match either "hip" or "hipv4" + bool isHip = EntryId.find("hip") == 0; + bool VersionMatches = + EntryId.find_last_of(SearchTargetId) == EntryId.size() - 1; + + if (isHip && VersionMatches) { + OutBinary = reinterpret_cast( + &Buff[CurrentEntry->ObjectOffset]); + OutLength = CurrentEntry->ObjectSize; + + if (reinterpret_cast(&OutBinary[OutLength]) > Limit) { + return UR_RESULT_ERROR_INVALID_PROGRAM; + } + return UR_RESULT_SUCCESS; + } + + CurrentEntry = reinterpret_cast(EntryId.end()); + } + + return UR_RESULT_ERROR_INVALID_PROGRAM; + } +}; + +} // namespace + +UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary( + ur_context_handle_t hContext, uint32_t numDevices, + ur_device_handle_t *phDevices, size_t *pLengths, const uint8_t **ppBinaries, + const ur_program_properties_t *, ur_program_handle_t *phProgram) { + if (numDevices > 1) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + + auto *RealBinary = ppBinaries[0]; + size_t RealLength = pLengths[0]; + + if (auto Parser = 
HipOffloadBundleParser::load(RealBinary, RealLength)) { + std::string DevName{}; + size_t DevNameLength; + olGetDeviceInfoSize(phDevices[0]->OffloadDevice, OL_DEVICE_INFO_NAME, + &DevNameLength); + DevName.resize(DevNameLength); + olGetDeviceInfo(phDevices[0]->OffloadDevice, OL_DEVICE_INFO_NAME, + DevNameLength, DevName.data()); + + auto Res = Parser->extract(DevName, RealBinary, RealLength); + if (Res != UR_RESULT_SUCCESS) { + return Res; + } + } + + ol_platform_backend_t Backend; + olGetPlatformInfo(phDevices[0]->Platform->OffloadPlatform, + OL_PLATFORM_INFO_BACKEND, sizeof(Backend), &Backend); + if (Backend == OL_PLATFORM_BACKEND_CUDA) { + return ProgramCreateCudaWorkaround(hContext, RealBinary, RealLength, + phProgram); + } + + ur_program_handle_t Program = new ur_program_handle_t_(); + auto Res = olCreateProgram(hContext->Device->OffloadDevice, RealBinary, + RealLength, &Program->OffloadProgram); + + if (Res != OL_SUCCESS) { + delete Program; + return offloadResultToUR(Res); + } + + *phProgram = Program; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t, + ur_program_handle_t, + const char *) { + // Do nothing, program is built upon creation + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(ur_program_handle_t, + uint32_t, + ur_device_handle_t *, + const char *) { + // Do nothing, program is built upon creation + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRetain(ur_program_handle_t hProgram) { + hProgram->RefCount++; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL +urProgramRelease(ur_program_handle_t hProgram) { + if (--hProgram->RefCount == 0) { + auto Res = olDestroyProgram(hProgram->OffloadProgram); + if (Res) { + return offloadResultToUR(Res); + } + delete hProgram; + } + + return UR_RESULT_SUCCESS; +} diff --git a/unified-runtime/source/adapters/offload/program.hpp 
// Adapter-side object behind a ur_program_handle_t. Its lifetime is governed
// by the reference count inherited from RefCounted.
struct ur_program_handle_t_ : RefCounted {
  // liboffload program wrapped by this handle. It has no default
  // initialiser; the creating code is expected to fill it in.
  ol_program_handle_t OffloadProgram;
};
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "context.hpp" +#include "device.hpp" +#include "queue.hpp" +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate( + [[maybe_unused]] ur_context_handle_t hContext, ur_device_handle_t hDevice, + const ur_queue_properties_t *, ur_queue_handle_t *phQueue) { + + assert(hContext->Device == hDevice); + + ur_queue_handle_t Queue = new ur_queue_handle_t_(); + auto Res = olCreateQueue(hDevice->OffloadDevice, &Queue->OffloadQueue); + if (Res != OL_SUCCESS) { + delete Queue; + return offloadResultToUR(Res); + } + + Queue->OffloadDevice = hDevice->OffloadDevice; + + *phQueue = Queue; + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { + hQueue->RefCount++; + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) { + if (--hQueue->RefCount == 0) { + auto Res = olDestroyQueue(hQueue->OffloadQueue); + if (Res) { + return offloadResultToUR(Res); + } + delete hQueue; + } + + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) { + return offloadResultToUR(olWaitQueue(hQueue->OffloadQueue)); +} diff --git a/unified-runtime/source/adapters/offload/queue.hpp b/unified-runtime/source/adapters/offload/queue.hpp new file mode 100644 index 0000000000000..6afe4bf15098e --- /dev/null +++ b/unified-runtime/source/adapters/offload/queue.hpp @@ -0,0 +1,21 @@ +//===----------- queue.hpp - LLVM Offload Adapter ------------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. 
See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +#include "common.hpp" + +struct ur_queue_handle_t_ : RefCounted { + ol_queue_handle_t OffloadQueue; + ol_device_handle_t OffloadDevice; +}; diff --git a/unified-runtime/source/adapters/offload/ur2offload.hpp b/unified-runtime/source/adapters/offload/ur2offload.hpp new file mode 100644 index 0000000000000..2e9835bc480d0 --- /dev/null +++ b/unified-runtime/source/adapters/offload/ur2offload.hpp @@ -0,0 +1,29 @@ +//===--------- ur2offload.hpp - LLVM Offload Adapter ----------------------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include + +inline ur_result_t offloadResultToUR(ol_result_t Result) { + if (Result == OL_SUCCESS) { + return UR_RESULT_SUCCESS; + } + + switch (Result->Code) { + case OL_ERRC_INVALID_NULL_HANDLE: + return UR_RESULT_ERROR_INVALID_NULL_HANDLE; + case OL_ERRC_INVALID_NULL_POINTER: + return UR_RESULT_ERROR_INVALID_NULL_POINTER; + case OL_ERRC_UNSUPPORTED_ENUMERATION: + return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION; + default: + return UR_RESULT_ERROR_UNKNOWN; + } +} diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp new file mode 100644 index 0000000000000..789bd653ea4bb --- /dev/null +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -0,0 +1,412 @@ +//===----------- ur_interface_loader.cpp - LLVM Offload Plugin -----------===// +// +// Copyright (C) 2024 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM 
namespace {

// TODO - this is a duplicate of what is in the L0 plugin
// We should move this to somewhere common
//
// Shared argument check for every urGet*ProcAddrTable entry point below.
// Returns UR_RESULT_ERROR_INVALID_NULL_POINTER for a null table,
// UR_RESULT_ERROR_UNSUPPORTED_VERSION if `version` differs from
// UR_API_VERSION_CURRENT, and UR_RESULT_SUCCESS otherwise.
ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) {
  if (pDdiTable == nullptr) {
    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
  }
  // Pre 1.0 we enforce that loader and adapter must have the same version.
  // Post 1.0 only a major version match should be required.
  if (version != UR_API_VERSION_CURRENT) {
    return UR_RESULT_ERROR_UNSUPPORTED_VERSION;
  }
  return UR_RESULT_SUCCESS;
}
} // namespace

// Every function below follows the same pattern: validate the arguments,
// then fill each slot of the DDI table with this adapter's entry point, or
// with nullptr where no entry point is provided here.
extern "C" {

// Platform entry points.
UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable(
    ur_api_version_t version, ur_platform_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnCreateWithNativeHandle = nullptr;
  pDdiTable->pfnGet = urPlatformGet;
  pDdiTable->pfnGetApiVersion = nullptr;
  pDdiTable->pfnGetInfo = urPlatformGetInfo;
  pDdiTable->pfnGetNativeHandle = nullptr;
  pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption;
  return UR_RESULT_SUCCESS;
}

// Context entry points.
UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable(
    ur_api_version_t version, ur_context_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnCreate = urContextCreate;
  pDdiTable->pfnCreateWithNativeHandle = nullptr;
  pDdiTable->pfnGetInfo = nullptr;
  pDdiTable->pfnGetNativeHandle = nullptr;
  pDdiTable->pfnRelease = urContextRelease;
  pDdiTable->pfnRetain = urContextRetain;
  pDdiTable->pfnSetExtendedDeleter = nullptr;
  return UR_RESULT_SUCCESS;
}

// Event entry points.
UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable(
    ur_api_version_t version, ur_event_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnCreateWithNativeHandle = nullptr;
  pDdiTable->pfnGetInfo = nullptr;
  pDdiTable->pfnGetNativeHandle = nullptr;
  pDdiTable->pfnGetProfilingInfo = nullptr;
  pDdiTable->pfnRelease = urEventRelease;
  pDdiTable->pfnRetain = urEventRetain;
  pDdiTable->pfnSetCallback = nullptr;
  pDdiTable->pfnWait = urEventWait;
  return UR_RESULT_SUCCESS;
}

// Program entry points.
UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable(
    ur_api_version_t version, ur_program_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnBuild = urProgramBuild;
  pDdiTable->pfnCompile = nullptr;
  pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary;
  pDdiTable->pfnCreateWithIL = nullptr;
  pDdiTable->pfnCreateWithNativeHandle = nullptr;
  pDdiTable->pfnGetBuildInfo = nullptr;
  pDdiTable->pfnGetFunctionPointer = nullptr;
  pDdiTable->pfnGetGlobalVariablePointer = nullptr;
  pDdiTable->pfnGetInfo = nullptr;
  pDdiTable->pfnGetNativeHandle = nullptr;
  pDdiTable->pfnLink = nullptr;
  pDdiTable->pfnRelease = urProgramRelease;
  pDdiTable->pfnRetain = urProgramRetain;
  pDdiTable->pfnSetSpecializationConstants = nullptr;
  return UR_RESULT_SUCCESS;
}

// Kernel entry points.
UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
    ur_api_version_t version, ur_kernel_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnCreate = urKernelCreate;
  pDdiTable->pfnCreateWithNativeHandle = nullptr;
  pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo;
  pDdiTable->pfnGetInfo = nullptr;
  pDdiTable->pfnGetNativeHandle = nullptr;
  pDdiTable->pfnGetSubGroupInfo = nullptr;
  pDdiTable->pfnRelease = urKernelRelease;
  pDdiTable->pfnRetain = urKernelRetain;
  pDdiTable->pfnSetArgLocal = nullptr;
  pDdiTable->pfnSetArgMemObj = nullptr;
  pDdiTable->pfnSetArgPointer = urKernelSetArgPointer;
  pDdiTable->pfnSetArgSampler = nullptr;
  pDdiTable->pfnSetArgValue = urKernelSetArgValue;
  pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
  pDdiTable->pfnSetSpecializationConstants = nullptr;
  pDdiTable->pfnGetSuggestedLocalWorkSize = nullptr;
  return UR_RESULT_SUCCESS;
}

// Sampler entry points (all left as nullptr).
UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable(
    ur_api_version_t version, ur_sampler_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnCreate = nullptr;
  pDdiTable->pfnCreateWithNativeHandle = nullptr;
  pDdiTable->pfnGetInfo = nullptr;
  pDdiTable->pfnGetNativeHandle = nullptr;
  pDdiTable->pfnRelease = nullptr;
  pDdiTable->pfnRetain = nullptr;
  return UR_RESULT_SUCCESS;
}

// Memory-object entry points (all left as nullptr).
UR_DLLEXPORT ur_result_t UR_APICALL
urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnBufferCreate = nullptr;
  pDdiTable->pfnBufferPartition = nullptr;
  pDdiTable->pfnBufferCreateWithNativeHandle = nullptr;
  pDdiTable->pfnImageCreateWithNativeHandle = nullptr;
  pDdiTable->pfnGetInfo = nullptr;
  pDdiTable->pfnGetNativeHandle = nullptr;
  pDdiTable->pfnImageCreate = nullptr;
  pDdiTable->pfnImageGetInfo = nullptr;
  pDdiTable->pfnRelease = nullptr;
  pDdiTable->pfnRetain = nullptr;
  return UR_RESULT_SUCCESS;
}

// Enqueue entry points; only the kernel launch slot is filled in.
UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
    ur_api_version_t version, ur_enqueue_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnDeviceGlobalVariableRead = nullptr;
  pDdiTable->pfnDeviceGlobalVariableWrite = nullptr;
  pDdiTable->pfnEventsWait = nullptr;
  pDdiTable->pfnEventsWaitWithBarrier = nullptr;
  pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch;
  pDdiTable->pfnMemBufferCopy = nullptr;
  pDdiTable->pfnMemBufferCopyRect = nullptr;
  pDdiTable->pfnMemBufferFill = nullptr;
  pDdiTable->pfnMemBufferMap = nullptr;
  pDdiTable->pfnMemBufferRead = nullptr;
  pDdiTable->pfnMemBufferReadRect = nullptr;
  pDdiTable->pfnMemBufferWrite = nullptr;
  pDdiTable->pfnMemBufferWriteRect = nullptr;
  pDdiTable->pfnMemImageCopy = nullptr;
  pDdiTable->pfnMemImageRead = nullptr;
  pDdiTable->pfnMemImageWrite = nullptr;
  pDdiTable->pfnMemUnmap = nullptr;
  pDdiTable->pfnUSMFill2D = nullptr;
  pDdiTable->pfnUSMFill = nullptr;
  pDdiTable->pfnUSMAdvise = nullptr;
  pDdiTable->pfnUSMMemcpy2D = nullptr;
  pDdiTable->pfnUSMMemcpy = nullptr;
  pDdiTable->pfnUSMPrefetch = nullptr;
  pDdiTable->pfnReadHostPipe = nullptr;
  pDdiTable->pfnWriteHostPipe = nullptr;
  return UR_RESULT_SUCCESS;
}

// Global (adapter-level) entry points.
UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable(
    ur_api_version_t version, ur_global_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnAdapterGet = urAdapterGet;
  pDdiTable->pfnAdapterGetInfo = urAdapterGetInfo;
  pDdiTable->pfnAdapterRelease = urAdapterRelease;
  pDdiTable->pfnAdapterRetain = urAdapterRetain;
  pDdiTable->pfnAdapterGetLastError = nullptr;
  return UR_RESULT_SUCCESS;
}

// Queue entry points.
UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable(
    ur_api_version_t version, ur_queue_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnCreate = urQueueCreate;
  pDdiTable->pfnCreateWithNativeHandle = nullptr;
  pDdiTable->pfnFinish = urQueueFinish;
  pDdiTable->pfnFlush = nullptr;
  pDdiTable->pfnGetInfo = nullptr;
  pDdiTable->pfnGetNativeHandle = nullptr;
  pDdiTable->pfnRelease = urQueueRelease;
  pDdiTable->pfnRetain = urQueueRetain;
  return UR_RESULT_SUCCESS;
}

// USM entry points.
UR_DLLEXPORT ur_result_t UR_APICALL
urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc;
  pDdiTable->pfnFree = urUSMFree;
  pDdiTable->pfnGetMemAllocInfo = nullptr;
  pDdiTable->pfnHostAlloc = urUSMHostAlloc;
  pDdiTable->pfnPoolCreate = nullptr;
  pDdiTable->pfnPoolRetain = nullptr;
  pDdiTable->pfnPoolRelease = nullptr;
  pDdiTable->pfnPoolGetInfo = nullptr;
  pDdiTable->pfnSharedAlloc = urUSMSharedAlloc;
  return UR_RESULT_SUCCESS;
}

// Device entry points.
UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable(
    ur_api_version_t version, ur_device_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnCreateWithNativeHandle = nullptr;
  pDdiTable->pfnGet = urDeviceGet;
  pDdiTable->pfnGetGlobalTimestamps = nullptr;
  pDdiTable->pfnGetInfo = urDeviceGetInfo;
  pDdiTable->pfnGetNativeHandle = nullptr;
  pDdiTable->pfnPartition = urDevicePartition;
  pDdiTable->pfnRelease = urDeviceRelease;
  pDdiTable->pfnRetain = urDeviceRetain;
  pDdiTable->pfnSelectBinary = urDeviceSelectBinary;
  return UR_RESULT_SUCCESS;
}

// Experimental command-buffer entry points (all left as nullptr).
UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable(
    ur_api_version_t version, ur_command_buffer_exp_dditable_t *pDdiTable) {
  auto retVal = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != retVal) {
    return retVal;
  }
  pDdiTable->pfnCreateExp = nullptr;
  pDdiTable->pfnRetainExp = nullptr;
  pDdiTable->pfnReleaseExp = nullptr;
  pDdiTable->pfnFinalizeExp = nullptr;
  pDdiTable->pfnAppendKernelLaunchExp = nullptr;
  pDdiTable->pfnAppendUSMMemcpyExp = nullptr;
  pDdiTable->pfnAppendMemBufferCopyExp = nullptr;
  pDdiTable->pfnAppendMemBufferCopyRectExp = nullptr;
  pDdiTable->pfnAppendMemBufferReadExp = nullptr;
  pDdiTable->pfnAppendMemBufferReadRectExp = nullptr;
  pDdiTable->pfnAppendMemBufferWriteExp = nullptr;
  pDdiTable->pfnAppendMemBufferWriteRectExp = nullptr;
  pDdiTable->pfnUpdateKernelLaunchExp = nullptr;
  pDdiTable->pfnGetInfoExp = nullptr;

  // retVal is UR_RESULT_SUCCESS here; failures returned early above.
  return retVal;
}

// Experimental USM peer-to-peer entry points (all left as nullptr).
UR_DLLEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable(
    ur_api_version_t version, ur_usm_p2p_exp_dditable_t *pDdiTable) {
  auto retVal = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != retVal) {
    return retVal;
  }
  pDdiTable->pfnEnablePeerAccessExp = nullptr;
  pDdiTable->pfnDisablePeerAccessExp = nullptr;
  pDdiTable->pfnPeerAccessGetInfoExp = nullptr;

  return retVal;
}

// Experimental bindless-image entry points (all left as nullptr).
UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable(
    ur_api_version_t version, ur_bindless_images_exp_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnUnsampledImageHandleDestroyExp = nullptr;
  pDdiTable->pfnSampledImageHandleDestroyExp = nullptr;
  pDdiTable->pfnImageAllocateExp = nullptr;
  pDdiTable->pfnImageFreeExp = nullptr;
  pDdiTable->pfnUnsampledImageCreateExp = nullptr;
  pDdiTable->pfnSampledImageCreateExp = nullptr;
  pDdiTable->pfnImageCopyExp = nullptr;
  pDdiTable->pfnImageGetInfoExp = nullptr;
  pDdiTable->pfnMipmapGetLevelExp = nullptr;
  pDdiTable->pfnMipmapFreeExp = nullptr;
  pDdiTable->pfnImportExternalMemoryExp = nullptr;
  pDdiTable->pfnMapExternalArrayExp = nullptr;
  pDdiTable->pfnMapExternalLinearMemoryExp = nullptr;
  pDdiTable->pfnReleaseExternalMemoryExp = nullptr;
  pDdiTable->pfnImportExternalSemaphoreExp = nullptr;
  pDdiTable->pfnReleaseExternalSemaphoreExp = nullptr;
  pDdiTable->pfnWaitExternalSemaphoreExp = nullptr;
  pDdiTable->pfnSignalExternalSemaphoreExp = nullptr;
  return UR_RESULT_SUCCESS;
}

// Physical-memory entry points (all left as nullptr).
UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable(
    ur_api_version_t version, ur_physical_mem_dditable_t *pDdiTable) {
  auto retVal = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != retVal) {
    return retVal;
  }

  pDdiTable->pfnCreate = nullptr;
  pDdiTable->pfnRelease = nullptr;
  pDdiTable->pfnRetain = nullptr;

  return retVal;
}

// Experimental USM entry points (all left as nullptr).
UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable(
    ur_api_version_t version, ur_usm_exp_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }
  pDdiTable->pfnPitchedAllocExp = nullptr;
  return UR_RESULT_SUCCESS;
}

// Virtual-memory entry points (all left as nullptr).
UR_DLLEXPORT ur_result_t UR_APICALL urGetVirtualMemProcAddrTable(
    ur_api_version_t version, ///< [in] API version requested
    ur_virtual_mem_dditable_t
        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
) {
  auto retVal = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != retVal) {
    return retVal;
  }

  pDdiTable->pfnFree = nullptr;
  pDdiTable->pfnGetInfo = nullptr;
  pDdiTable->pfnGranularityGetInfo = nullptr;
  pDdiTable->pfnMap = nullptr;
  pDdiTable->pfnReserve = nullptr;
  pDdiTable->pfnSetAccess = nullptr;
  pDdiTable->pfnUnmap = nullptr;

  return retVal;
}

// Experimental enqueue entry points (all left as nullptr).
UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
    ur_api_version_t version, ur_enqueue_exp_dditable_t *pDdiTable) {
  auto result = validateProcInputs(version, pDdiTable);
  if (UR_RESULT_SUCCESS != result) {
    return result;
  }

  pDdiTable->pfnCooperativeKernelLaunchExp = nullptr;
  pDdiTable->pfnTimestampRecordingExp = nullptr;
  pDdiTable->pfnNativeCommandExp = nullptr;

  return UR_RESULT_SUCCESS;
}
UR_RESULT_SUCCESS; +} + +UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable( + ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) { + auto result = validateProcInputs(version, pDdiTable); + if (UR_RESULT_SUCCESS != result) { + return result; + } + + pDdiTable->pfnBuildExp = urProgramBuildExp; + pDdiTable->pfnCompileExp = nullptr; + pDdiTable->pfnLinkExp = nullptr; + + return UR_RESULT_SUCCESS; +} +} // extern "C" diff --git a/unified-runtime/source/adapters/offload/usm.cpp b/unified-runtime/source/adapters/offload/usm.cpp new file mode 100644 index 0000000000000..497e454885f06 --- /dev/null +++ b/unified-runtime/source/adapters/offload/usm.cpp @@ -0,0 +1,64 @@ +//===----------- usm.cpp - LLVM Offload Adapter --------------------------===// +// +// Copyright (C) 2025 Intel Corporation +// +// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM +// Exceptions. See LICENSE.TXT +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include +#include +#include + +#include "context.hpp" +#include "device.hpp" +#include "ur2offload.hpp" + +UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext, + const ur_usm_desc_t *, + ur_usm_pool_handle_t, + size_t size, void **ppMem) { + auto Res = olMemAlloc(hContext->Device->OffloadDevice, OL_ALLOC_TYPE_HOST, + size, ppMem); + + if (Res != OL_SUCCESS) { + return offloadResultToUR(Res); + } + + hContext->AllocTypeMap.insert_or_assign(*ppMem, OL_ALLOC_TYPE_HOST); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc( + ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *, + ur_usm_pool_handle_t, size_t size, void **ppMem) { + auto Res = olMemAlloc(hContext->Device->OffloadDevice, OL_ALLOC_TYPE_DEVICE, + size, ppMem); + + if (Res != OL_SUCCESS) { + return offloadResultToUR(Res); + } + + 
hContext->AllocTypeMap.insert_or_assign(*ppMem, OL_ALLOC_TYPE_DEVICE); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc( + ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *, + ur_usm_pool_handle_t, size_t size, void **ppMem) { + auto Res = olMemAlloc(hContext->Device->OffloadDevice, OL_ALLOC_TYPE_MANAGED, + size, ppMem); + + if (Res != OL_SUCCESS) { + return offloadResultToUR(Res); + } + + hContext->AllocTypeMap.insert_or_assign(*ppMem, OL_ALLOC_TYPE_MANAGED); + return UR_RESULT_SUCCESS; +} + +UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t, void *pMem) { + return offloadResultToUR(olMemFree(pMem)); +} diff --git a/unified-runtime/source/loader/ur_adapter_registry.hpp b/unified-runtime/source/loader/ur_adapter_registry.hpp index 53fed55ea0c8a..36e5e9a602756 100644 --- a/unified-runtime/source/loader/ur_adapter_registry.hpp +++ b/unified-runtime/source/loader/ur_adapter_registry.hpp @@ -40,6 +40,7 @@ struct FilterTerm { {"cuda", UR_ADAPTER_BACKEND_CUDA}, {"hip", UR_ADAPTER_BACKEND_HIP}, {"native_cpu", UR_ADAPTER_BACKEND_NATIVE_CPU}, + {"offload", UR_ADAPTER_BACKEND_OFFLOAD}, }; bool matchesBackend(const ur_adapter_backend_t &match_backend) const { diff --git a/unified-runtime/source/loader/ur_lib.cpp b/unified-runtime/source/loader/ur_lib.cpp index 8163be0fbaebf..ca7c8f1bacc8f 100644 --- a/unified-runtime/source/loader/ur_lib.cpp +++ b/unified-runtime/source/loader/ur_lib.cpp @@ -251,13 +251,15 @@ ur_result_t urDeviceGetSelected(ur_platform_handle_t hPlatform, uint32_t NumEntries, ur_device_handle_t *phDevices, uint32_t *pNumDevices) { - constexpr std::pair adapters[6] = { + constexpr std::pair adapters[7] = { {UR_PLATFORM_BACKEND_UNKNOWN, "*"}, {UR_PLATFORM_BACKEND_LEVEL_ZERO, "level_zero"}, {UR_PLATFORM_BACKEND_OPENCL, "opencl"}, {UR_PLATFORM_BACKEND_CUDA, "cuda"}, {UR_PLATFORM_BACKEND_HIP, "hip"}, - {UR_PLATFORM_BACKEND_NATIVE_CPU, "native_cpu"}}; + {UR_PLATFORM_BACKEND_NATIVE_CPU, 
"native_cpu"}, + {UR_PLATFORM_BACKEND_OFFLOAD, "offload"}, + }; if (!hPlatform) { return UR_RESULT_ERROR_INVALID_NULL_HANDLE; diff --git a/unified-runtime/source/loader/ur_manifests.hpp b/unified-runtime/source/loader/ur_manifests.hpp index 2ed89fc79f4f0..9981d8d1fa5bf 100644 --- a/unified-runtime/source/loader/ur_manifests.hpp +++ b/unified-runtime/source/loader/ur_manifests.hpp @@ -79,5 +79,12 @@ const std::vector ur_adapter_manifests = { { UR_DEVICE_TYPE_CPU, }}, + {"offload", + MAKE_LIBRARY_NAME("ur_adapter_offload", "0"), + UR_ADAPTER_BACKEND_OFFLOAD, + { + UR_DEVICE_TYPE_CPU, + UR_DEVICE_TYPE_GPU, + }}, }; } // namespace ur_loader diff --git a/unified-runtime/test/conformance/CMakeLists.txt b/unified-runtime/test/conformance/CMakeLists.txt index 9e29e727e2e2a..182b80affb423 100644 --- a/unified-runtime/test/conformance/CMakeLists.txt +++ b/unified-runtime/test/conformance/CMakeLists.txt @@ -125,13 +125,13 @@ if(UR_FOUND_DPCXX) file(MAKE_DIRECTORY ${UR_CONFORMANCE_DEVICE_BINARIES_DIR}) if("${UR_CONFORMANCE_TARGET_TRIPLES}" STREQUAL "") - if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_L0_V2 OR UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL) + if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_L0_V2 OR UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL) list(APPEND TARGET_TRIPLES "spir64") endif() - if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL) + if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_ALL) list(APPEND TARGET_TRIPLES "nvptx64-nvidia-cuda") endif() - if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_ALL) + if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_ALL) list(APPEND TARGET_TRIPLES "amdgcn-amd-amdhsa") endif() else() diff --git a/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp b/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp index 71a55c93292b2..06b269f54ae2d 100644 --- a/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp +++ 
b/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp
@@ -106,7 +106,10 @@ TEST_P(urPlatformGetInfoTest, SuccessBackend) {
                                  &property_value, nullptr));
 
-  ASSERT_TRUE(property_value >= UR_PLATFORM_BACKEND_LEVEL_ZERO &&
-              property_value <= UR_PLATFORM_BACKEND_NATIVE_CPU);
+  // UR_PLATFORM_BACKEND_OFFLOAD (0x100) is not contiguous with the other
+  // backends, so compare it separately instead of widening the range, which
+  // would also accept the undefined values in between.
+  ASSERT_TRUE((property_value >= UR_PLATFORM_BACKEND_LEVEL_ZERO &&
+               property_value <= UR_PLATFORM_BACKEND_NATIVE_CPU) ||
+              property_value == UR_PLATFORM_BACKEND_OFFLOAD);
 }
 
 TEST_P(urPlatformGetInfoTest, SuccessAdapter) {
diff --git a/unified-runtime/test/conformance/source/environment.cpp b/unified-runtime/test/conformance/source/environment.cpp
index d1fe951e5ce00..8298f662dcc51 100644
--- a/unified-runtime/test/conformance/source/environment.cpp
+++ b/unified-runtime/test/conformance/source/environment.cpp
@@ -215,6 +215,33 @@ std::string KernelsEnvironment::getTargetName(ur_platform_handle_t platform) {
     return "nvptx64-nvidia-cuda";
   case UR_PLATFORM_BACKEND_HIP:
     return "amdgcn-amd-amdhsa";
+  case UR_PLATFORM_BACKEND_OFFLOAD: {
+    // All Offload platforms report this backend, use the platform name to
+    // select the actual underlying backend.
+    size_t PlatformNameSize = 0;
+    if (urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, 0, nullptr,
+                          &PlatformNameSize) != UR_RESULT_SUCCESS ||
+        PlatformNameSize == 0) {
+      error = "Could not query the name of the Offload platform";
+      return {};
+    }
+    // One extra zero-initialised element keeps the buffer NUL-terminated for
+    // strcmp even if the adapter does not write a terminator.
+    std::vector<char> PlatformName(PlatformNameSize + 1, '\0');
+    if (urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, PlatformNameSize,
+                          PlatformName.data(), nullptr) != UR_RESULT_SUCCESS) {
+      error = "Could not query the name of the Offload platform";
+      return {};
+    }
+    if (std::strcmp(PlatformName.data(), "CUDA") == 0) {
+      return "nvptx64-nvidia-cuda";
+    }
+    if (std::strcmp(PlatformName.data(), "AMDGPU") == 0) {
+      return "amdgcn-amd-amdhsa";
+    }
+    error = "Could not detect target for Offload platform";
+    return {};
+  }
   case UR_PLATFORM_BACKEND_NATIVE_CPU:
     error = "native_cpu doesn't support kernel tests yet";
     return {};
@@ -297,7 +324,8 @@ void KernelsEnvironment::CreateProgram(
                                   sizeof(ur_platform_backend_t), &backend, nullptr));
 
   if (backend == UR_PLATFORM_BACKEND_HIP ||
-      backend == UR_PLATFORM_BACKEND_CUDA) {
+      backend == UR_PLATFORM_BACKEND_CUDA ||
+      backend == UR_PLATFORM_BACKEND_OFFLOAD) {
     // The CUDA and HIP adapters do not support urProgramCreateWithIL so we
     // need to use urProgramCreateWithBinary instead.
     auto size = binary.size();