From c1460fbd88c912c4e1c1f7987ad683c7c89a5b0e Mon Sep 17 00:00:00 2001
From: Callum Fare <callum@codeplay.com>
Date: Tue, 6 Aug 2024 13:59:29 +0100
Subject: [PATCH 01/11] Add initial Offload adapter implementation

---
 .../source/adapters/CMakeLists.txt            |   5 +
 .../source/adapters/offload/.clang-format     |   4 +
 .../source/adapters/offload/CMakeLists.txt    |  58 +++
 .../source/adapters/offload/adapter.cpp       |  96 ++++
 .../source/adapters/offload/adapter.hpp       |  20 +
 .../source/adapters/offload/common.hpp        |   7 +
 .../source/adapters/offload/context.cpp       |  28 ++
 .../source/adapters/offload/context.hpp       |  19 +
 .../source/adapters/offload/device.cpp        | 159 +++++++
 .../source/adapters/offload/enqueue.cpp       |  51 +++
 .../source/adapters/offload/event.cpp         |  36 ++
 .../source/adapters/offload/event.hpp         |  10 +
 .../source/adapters/offload/kernel.cpp        |  70 +++
 .../source/adapters/offload/kernel.hpp        |  52 +++
 .../source/adapters/offload/platform.cpp      | 108 +++++
 .../source/adapters/offload/program.cpp       | 101 +++++
 .../source/adapters/offload/program.hpp       |  10 +
 .../source/adapters/offload/queue.cpp         |  49 +++
 .../source/adapters/offload/queue.hpp         |  11 +
 .../source/adapters/offload/ur2offload.hpp    |  29 ++
 .../adapters/offload/ur_interface_loader.cpp  | 412 ++++++++++++++++++
 .../source/adapters/offload/usm.cpp           |  53 +++
 22 files changed, 1388 insertions(+)
 create mode 100644 unified-runtime/source/adapters/offload/.clang-format
 create mode 100644 unified-runtime/source/adapters/offload/CMakeLists.txt
 create mode 100644 unified-runtime/source/adapters/offload/adapter.cpp
 create mode 100644 unified-runtime/source/adapters/offload/adapter.hpp
 create mode 100644 unified-runtime/source/adapters/offload/common.hpp
 create mode 100644 unified-runtime/source/adapters/offload/context.cpp
 create mode 100644 unified-runtime/source/adapters/offload/context.hpp
 create mode 100644 unified-runtime/source/adapters/offload/device.cpp
 create mode 100644 unified-runtime/source/adapters/offload/enqueue.cpp
 create mode 100644 unified-runtime/source/adapters/offload/event.cpp
 create mode 100644 unified-runtime/source/adapters/offload/event.hpp
 create mode 100644 unified-runtime/source/adapters/offload/kernel.cpp
 create mode 100644 unified-runtime/source/adapters/offload/kernel.hpp
 create mode 100644 unified-runtime/source/adapters/offload/platform.cpp
 create mode 100644 unified-runtime/source/adapters/offload/program.cpp
 create mode 100644 unified-runtime/source/adapters/offload/program.hpp
 create mode 100644 unified-runtime/source/adapters/offload/queue.cpp
 create mode 100644 unified-runtime/source/adapters/offload/queue.hpp
 create mode 100644 unified-runtime/source/adapters/offload/ur2offload.hpp
 create mode 100644 unified-runtime/source/adapters/offload/ur_interface_loader.cpp
 create mode 100644 unified-runtime/source/adapters/offload/usm.cpp

diff --git a/unified-runtime/source/adapters/CMakeLists.txt b/unified-runtime/source/adapters/CMakeLists.txt
index 56e053d29bc3e..34ba19b6e8859 100644
--- a/unified-runtime/source/adapters/CMakeLists.txt
+++ b/unified-runtime/source/adapters/CMakeLists.txt
@@ -68,9 +68,14 @@ if(UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL)
     add_ur_adapter_subdirectory(opencl)
     list(APPEND TEMP_LIST "opencl")
 endif()
+
 if(UR_BUILD_ADAPTER_NATIVE_CPU OR UR_BUILD_ADAPTER_ALL)
     add_ur_adapter_subdirectory(native_cpu)
     list(APPEND TEMP_LIST "native_cpu")
 endif()
 
+if(UR_BUILD_ADAPTER_OFFLOAD)
+    add_ur_adapter_subdirectory(offload)
+endif()
+
 set(UR_ADAPTERS_LIST "${TEMP_LIST}" CACHE STRING "" FORCE)
diff --git a/unified-runtime/source/adapters/offload/.clang-format b/unified-runtime/source/adapters/offload/.clang-format
new file mode 100644
index 0000000000000..c8daebc205b34
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/.clang-format
@@ -0,0 +1,4 @@
+---
+Language: Cpp
+BasedOnStyle: LLVM
+...
diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt
new file mode 100644
index 0000000000000..d3559c2ae7761
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/CMakeLists.txt
@@ -0,0 +1,58 @@
+set(UR_OFFLOAD_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH "Offload adapter directory") # TODO
+
+set(TARGET_NAME ur_adapter_offload)
+
+set(UR_OFFLOAD_INSTALL_DIR "" CACHE PATH "Path to the directory containing libomptarget.so etc")
+if (UR_OFFLOAD_INSTALL_DIR STREQUAL "")
+    message(FATAL_ERROR "UR_OFFLOAD_INSTALL_DIR must be defined for the Offload adapter")
+endif()
+
+set(UR_OFFLOAD_INCLUDE_DIR "" CACHE PATH "Path to the directory containing LLVM headers")
+if (UR_OFFLOAD_INCLUDE_DIR STREQUAL "")
+    message(FATAL_ERROR "UR_OFFLOAD_INCLUDE_DIR must be defined for the Offload adapter")
+endif()
+
+# The PTX workaround in program.cpp calls the CUDA driver API: link with CUDA.
+if (NOT TARGET cudadrv) # only define the imported target if nobody else has
+        find_package(CUDA 10.1 REQUIRED)
+        add_library(cudadrv SHARED IMPORTED GLOBAL)
+        set_target_properties(
+                cudadrv PROPERTIES 
+                IMPORTED_LOCATION             ${CUDA_CUDA_LIBRARY}
+                INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS}
+        )
+endif()
+
+add_ur_adapter(${TARGET_NAME}
+        SHARED
+        ${CMAKE_CURRENT_SOURCE_DIR}/adapter.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/context.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/device.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/enqueue.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/event.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/kernel.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/platform.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/program.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/queue.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/ur2offload.hpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/ur_interface_loader.cpp
+        ${CMAKE_CURRENT_SOURCE_DIR}/usm.cpp
+)
+
+set_target_properties(${TARGET_NAME} PROPERTIES
+        VERSION "${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR}.${PROJECT_VERSION_PATCH}"
+        SOVERSION "${PROJECT_VERSION_MAJOR}"
+)
+
+target_link_libraries(${TARGET_NAME} PRIVATE
+        ${PROJECT_NAME}::headers
+        ${PROJECT_NAME}::common
+        ${PROJECT_NAME}::umf
+        ${UR_OFFLOAD_INSTALL_DIR}/lib/libLLVMOffload.so
+        cudadrv
+)
+
+target_include_directories(${TARGET_NAME} PRIVATE
+        "${UR_OFFLOAD_INCLUDE_DIR}/offload"
+        "${CMAKE_CURRENT_SOURCE_DIR}/../../"
+)
diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp
new file mode 100644
index 0000000000000..bde5dc8e8e6a5
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/adapter.cpp
@@ -0,0 +1,96 @@
+//===----------- adapter.cpp - LLVM Offload Plugin  -----------------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <atomic>
+#include <cstdint>
+#include <OffloadAPI.h>
+#include <unordered_set>
+
+#include "adapter.hpp"
+#include "ur/ur.hpp"
+#include "ur_api.h"
+
+ur_adapter_handle_t_ Adapter{};
+
+// Initialize liboffload and perform the initial platform and device discovery
+ur_result_t ur_adapter_handle_t_::init() {
+  if (olInit() != OL_SUCCESS) {
+    return UR_RESULT_ERROR_UNINITIALIZED;
+  }
+  // Record every non-host platform (the unordered_set deduplicates platforms
+  // reported by several devices) and remember the host device; only one host
+  // device is expected. The locals get defaults so that a failed query cannot
+  // leave them uninitialized when they are inspected afterwards.
+  auto Res = olIterateDevices(
+      [](ol_device_handle_t D, void *UserData) {
+        auto Adapter = static_cast<ur_adapter_handle_t>(UserData);
+        ol_platform_handle_t Platform = nullptr;
+        olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform),
+                        &Platform);
+        ol_platform_backend_t Backend = OL_PLATFORM_BACKEND_UNKNOWN;
+        olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend),
+                          &Backend);
+        if (Backend == OL_PLATFORM_BACKEND_HOST) {
+          Adapter->HostDevice = D;
+        } else if (Backend != OL_PLATFORM_BACKEND_UNKNOWN) {
+          Adapter->Platforms.insert(Platform);
+        }
+        return false;
+      },
+      this);
+
+  return Res == OL_SUCCESS ? UR_RESULT_SUCCESS : UR_RESULT_ERROR_UNKNOWN;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urAdapterGet(
+    uint32_t, ur_adapter_handle_t *phAdapters, uint32_t *pNumAdapters) {
+  if (pNumAdapters != nullptr) {
+    *pNumAdapters = 1; // there is only ever one Offload adapter
+  }
+  if (phAdapters != nullptr) {
+    if (Adapter.RefCount.fetch_add(1) == 0) { // first reference: run init()
+      Adapter.init();
+    }
+    *phAdapters = &Adapter;
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urAdapterRelease(ur_adapter_handle_t) {
+  // Drop one reference. olShutDown() is intentionally not called when the
+  // count reaches zero because it can crash when tracing is enabled.
+  // TODO: call olShutDown() on the last release once that is resolved.
+  Adapter.RefCount.fetch_sub(1);
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urAdapterRetain(ur_adapter_handle_t) {
+  Adapter.RefCount.fetch_add(1); // single global adapter: the handle is unused
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t,
+                                                     ur_adapter_info_t propName,
+                                                     size_t propSize,
+                                                     void *pPropValue,
+                                                     size_t *pPropSizeRet) {
+  // Wraps the caller-provided output arguments.
+  UrReturnHelper Reply(propSize, pPropValue, pPropSizeRet);
+
+  if (propName == UR_ADAPTER_INFO_BACKEND) {
+    // TODO: Return a proper value
+    return Reply(UR_ADAPTER_BACKEND_CUDA);
+  }
+  if (propName == UR_ADAPTER_INFO_REFERENCE_COUNT) {
+    return Reply(Adapter.RefCount.load());
+  }
+
+  // Every other property is rejected.
+  return UR_RESULT_ERROR_INVALID_ENUMERATION;
+}
diff --git a/unified-runtime/source/adapters/offload/adapter.hpp b/unified-runtime/source/adapters/offload/adapter.hpp
new file mode 100644
index 0000000000000..be36aceaa8410
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/adapter.hpp
@@ -0,0 +1,20 @@
+#pragma once
+
+#include <atomic>
+#include <cstdint>
+#include <unordered_set>
+
+#include <OffloadAPI.h>
+
+#include "logger/ur_logger.hpp"
+
+struct ur_adapter_handle_t_ {
+  std::atomic_uint32_t RefCount = 0; // starts at zero; see urAdapterGet
+  logger::Logger &Logger = logger::get_logger("offload");
+  ol_device_handle_t HostDevice = nullptr; // host device, set by init()
+  std::unordered_set<ol_platform_handle_t> Platforms; // non-host platforms
+
+  ur_result_t init(); // initializes liboffload, fills HostDevice and Platforms
+};
+
+extern ur_adapter_handle_t_ Adapter;
diff --git a/unified-runtime/source/adapters/offload/common.hpp b/unified-runtime/source/adapters/offload/common.hpp
new file mode 100644
index 0000000000000..69aa6bff11e9f
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/common.hpp
@@ -0,0 +1,7 @@
+#pragma once
+
+#include <atomic>
+
+struct RefCounted {
+  std::atomic_uint32_t RefCount = 1; // a new object starts with one reference
+};
diff --git a/unified-runtime/source/adapters/offload/context.cpp b/unified-runtime/source/adapters/offload/context.cpp
new file mode 100644
index 0000000000000..01d015038c3b1
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/context.cpp
@@ -0,0 +1,28 @@
+#include "context.hpp"
+#include <ur_api.h>
+
+UR_APIEXPORT ur_result_t UR_APICALL urContextCreate(
+    uint32_t DeviceCount, const ur_device_handle_t *phDevices,
+    const ur_context_properties_t *, ur_context_handle_t *phContext) {
+  // A context wraps a single device; multi-device contexts are rejected.
+  const bool MultiDevice = DeviceCount > 1;
+  if (MultiDevice) {
+    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  }
+  *phContext = new ur_context_handle_t_(phDevices[0]);
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urContextRetain(ur_context_handle_t hContext) {
+  hContext->RefCount.fetch_add(1); // one more owner of this context
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urContextRelease(ur_context_handle_t hContext) {
+  if (hContext->RefCount.fetch_sub(1) == 1) { // this was the last reference
+    delete hContext;
+  }
+  return UR_RESULT_SUCCESS;
+}
diff --git a/unified-runtime/source/adapters/offload/context.hpp b/unified-runtime/source/adapters/offload/context.hpp
new file mode 100644
index 0000000000000..9483ec1b4a8b8
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/context.hpp
@@ -0,0 +1,19 @@
+#pragma once
+
+#include <atomic>
+#include <unordered_map>
+#include <ur_api.h>
+#include <OffloadAPI.h>
+
+struct ur_context_handle_t_ {
+  // Holds a reference on the device for as long as the context exists.
+  ur_context_handle_t_(ur_device_handle_t hDevice) : Device{hDevice} {
+    urDeviceRetain(Device);
+  }
+  ~ur_context_handle_t_() { urDeviceRelease(Device); }
+
+  ur_device_handle_t Device;
+  // Starts at 1 for the creator; without an initializer the C++17 atomic
+  // would hold an indeterminate value and release could never hit zero.
+  std::atomic_uint32_t RefCount = 1;
+  std::unordered_map<void*, ol_alloc_type_t> AllocTypeMap;
+};
diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp
new file mode 100644
index 0000000000000..54cc60c6fe110
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/device.cpp
@@ -0,0 +1,159 @@
+#include <OffloadAPI.h>
+#include <ur/ur.hpp>
+#include <ur_api.h>
+
+#include "ur2offload.hpp"
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform,
+                                                ur_device_type_t,
+                                                uint32_t NumEntries,
+                                                ur_device_handle_t *phDevices,
+                                                uint32_t *pNumDevices) {
+  // Finds the devices that belong to hPlatform; the device type is ignored.
+  uint32_t NumDevices = 0;
+  // Pass a few things to the callback (we can't use a lambda with captures)
+  using ParamsT = struct {
+    uint32_t DeviceLimit;
+    uint32_t &NumDevices;
+    ol_platform_handle_t Platform;
+    ol_device_handle_t *DevicesOut;
+  };
+  ParamsT Params = {NumEntries, NumDevices,
+                    reinterpret_cast<ol_platform_handle_t>(hPlatform),
+                    reinterpret_cast<ol_device_handle_t *>(phDevices)};
+  olIterateDevices(
+      [](ol_device_handle_t D, void *Data) {
+        auto Params = reinterpret_cast<ParamsT *>(Data);
+        ol_platform_handle_t Platform = nullptr;
+        olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform),
+                        &Platform);
+        if (Platform == Params->Platform) {
+          if (Params->DevicesOut) {
+            Params->DevicesOut[Params->NumDevices] = D;
+          }
+          Params->NumDevices++;
+        }
+        // Stop early only when an output array has been filled. A pure count
+        // query (no array, NumEntries == 0) must not stop before any match.
+        return Params->DevicesOut && Params->NumDevices == Params->DeviceLimit;
+      },
+      &Params);
+  if (pNumDevices) {
+    *pNumDevices = NumDevices;
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
+                                                    ur_device_info_t propName,
+                                                    size_t propSize,
+                                                    void *pPropValue,
+                                                    size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+  // Properties either get a fixed answer here or are forwarded to liboffload.
+  ol_device_info_t olInfo;
+  switch (propName) {
+  case UR_DEVICE_INFO_NAME:
+    olInfo = OL_DEVICE_INFO_NAME;
+    break;
+  case UR_DEVICE_INFO_PARENT_DEVICE:
+    return ReturnValue(nullptr); // no parent is ever reported
+  case UR_DEVICE_INFO_VERSION:
+    return ReturnValue(""); // fixed empty string
+  case UR_DEVICE_INFO_EXTENSIONS:
+    return ReturnValue(""); // fixed empty string
+  case UR_DEVICE_INFO_USE_NATIVE_ASSERT:
+    return ReturnValue(false);
+  case UR_DEVICE_INFO_TYPE:
+    olInfo = OL_DEVICE_INFO_TYPE; // converted to the UR enum further down
+    break;
+  case UR_DEVICE_INFO_VENDOR:
+    olInfo = OL_DEVICE_INFO_VENDOR;
+    break;
+  case UR_DEVICE_INFO_DRIVER_VERSION:
+    olInfo = OL_DEVICE_INFO_DRIVER_VERSION;
+    break;
+  case UR_DEVICE_INFO_PLATFORM:
+    olInfo = OL_DEVICE_INFO_PLATFORM;
+    break;
+  case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT:
+    return ReturnValue(UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS);
+  case UR_DEVICE_INFO_BUILD_ON_SUBDEVICE:
+    return ReturnValue(false);
+  default:
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  }
+
+  if (pPropSizeRet) { // the caller asked for the size of the property
+    if (auto Res =
+            olGetDeviceInfoSize(reinterpret_cast<ol_device_handle_t>(hDevice),
+                                olInfo, pPropSizeRet)) {
+      return offloadResultToUR(Res);
+    }
+  }
+
+  if (pPropValue) { // the caller asked for the value of the property
+    if (auto Res =
+            olGetDeviceInfo(reinterpret_cast<ol_device_handle_t>(hDevice),
+                            olInfo, propSize, pPropValue)) {
+      return offloadResultToUR(Res);
+    }
+    // Need to explicitly map this type
+    if (olInfo == OL_DEVICE_INFO_TYPE) {
+      // Both pointers alias the caller's buffer: the value is rewritten there.
+      auto urPropPtr = reinterpret_cast<ur_device_type_t *>(pPropValue);
+      auto olPropPtr = reinterpret_cast<ol_device_type_t *>(pPropValue);
+
+      switch (*olPropPtr) {
+      case OL_DEVICE_TYPE_CPU:
+        *urPropPtr = UR_DEVICE_TYPE_CPU;
+        break;
+      case OL_DEVICE_TYPE_GPU:
+        *urPropPtr = UR_DEVICE_TYPE_GPU;
+        break;
+      default:
+        break; // any other value stays exactly as liboffload wrote it
+      }
+    }
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+// Device partitioning is not supported in Offload, and won't be for some time.
+// This means urDeviceRetain/Release are no-ops because all devices are root
+// devices.
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceRetain(ur_device_handle_t) {
+  return UR_RESULT_SUCCESS; // no-op: the handle is ignored, nothing is counted
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceRelease(ur_device_handle_t) {
+  return UR_RESULT_SUCCESS; // no-op: the handle is ignored, nothing is freed
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urDevicePartition(ur_device_handle_t, const ur_device_partition_properties_t *,
+                  uint32_t, ur_device_handle_t *, uint32_t *) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; // all arguments are ignored
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary(
+    ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries,
+    uint32_t NumBinaries, uint32_t *pSelectedBinary) {
+  // The device is not consulted yet: the wanted target is fixed below.
+  std::ignore = hDevice;
+
+  // TODO: Don't hard code nvptx64!
+  const char *WantedTarget = UR_DEVICE_BINARY_TARGET_NVPTX64;
+  // Report the index of the first binary built for the wanted target.
+  uint32_t Index = 0;
+  while (Index < NumBinaries &&
+         strcmp(pBinaries[Index].pDeviceTargetSpec, WantedTarget) != 0) {
+    ++Index;
+  }
+  if (Index == NumBinaries) {
+    return UR_RESULT_ERROR_INVALID_BINARY; // no image matches this device
+  }
+  *pSelectedBinary = Index;
+  return UR_RESULT_SUCCESS;
+}
diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp
new file mode 100644
index 0000000000000..30f5a099429fa
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/enqueue.cpp
@@ -0,0 +1,51 @@
+#include <OffloadAPI.h>
+#include <assert.h>
+#include <ur_api.h>
+
+#include "event.hpp"
+#include "kernel.hpp"
+#include "queue.hpp"
+#include "ur2offload.hpp"
+
+UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
+    ur_queue_handle_t hQueue, ur_kernel_handle_t hKernel, uint32_t workDim,
+    const size_t *pGlobalWorkOffset, const size_t *pGlobalWorkSize,
+    const size_t *pLocalWorkSize, uint32_t numEventsInWaitList,
+    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
+  // Not honoured yet: the wait list, the global offset and the local size.
+  (void)numEventsInWaitList;
+  (void)phEventWaitList;
+  (void)pGlobalWorkOffset;
+  (void)pLocalWorkSize;
+
+  // Only one-dimensional launches are handled.
+  assert(workDim == 1);
+
+  // Every work item becomes its own group of size one along X.
+  ol_kernel_launch_size_args_t Launch;
+  Launch.Dimensions = workDim;
+  Launch.DynSharedMemory = 0;
+  Launch.GroupSizeX = 1;
+  Launch.GroupSizeY = 1;
+  Launch.GroupSizeZ = 1;
+  Launch.NumGroupsX = pGlobalWorkSize[0];
+  Launch.NumGroupsY = 1;
+  Launch.NumGroupsZ = 1;
+
+  ol_event_handle_t NativeEvent;
+  auto Status =
+      olLaunchKernel(hQueue->OffloadQueue, hQueue->OffloadDevice,
+                     hKernel->OffloadKernel, hKernel->Args.getStorage(),
+                     hKernel->Args.getStorageSize(), &Launch, &NativeEvent);
+  if (Status != OL_SUCCESS) {
+    return offloadResultToUR(Status);
+  }
+
+  // Wrap the native event only when the caller asked for one.
+  if (phEvent != nullptr) {
+    auto *Wrapper = new ur_event_handle_t_();
+    Wrapper->OffloadEvent = NativeEvent;
+    *phEvent = Wrapper;
+  }
+  return UR_RESULT_SUCCESS;
+}
diff --git a/unified-runtime/source/adapters/offload/event.cpp b/unified-runtime/source/adapters/offload/event.cpp
new file mode 100644
index 0000000000000..5dec5fa29d113
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/event.cpp
@@ -0,0 +1,36 @@
+#include <OffloadAPI.h>
+#include <ur_api.h>
+
+#include "event.hpp"
+#include "ur2offload.hpp"
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urEventWait(uint32_t numEvents, const ur_event_handle_t *phEventWaitList) {
+  // Block on each event in order; the first failure aborts the wait.
+  for (uint32_t Idx = 0; Idx != numEvents; ++Idx) {
+    if (auto Res = olWaitEvent(phEventWaitList[Idx]->OffloadEvent)) {
+      return offloadResultToUR(Res);
+    }
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEventRetain(ur_event_handle_t hEvent) {
+  // Register one more owner of the event.
+  hEvent->RefCount.fetch_add(1);
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urEventRelease(ur_event_handle_t hEvent) {
+  // Free the wrapper only once the last reference is gone, not on every call.
+  if (--hEvent->RefCount == 0) {
+    // There's a small bug in olDestroyEvent that will crash. Leak the event
+    // in the meantime.
+    // auto Res = olDestroyEvent(hEvent->OffloadEvent);
+    // if (Res) {
+    //   return offloadResultToUR(Res);
+    // }
+    delete hEvent;
+  }
+  return UR_RESULT_SUCCESS;
+}
diff --git a/unified-runtime/source/adapters/offload/event.hpp b/unified-runtime/source/adapters/offload/event.hpp
new file mode 100644
index 0000000000000..95f692214e6f1
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/event.hpp
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <ur_api.h>
+#include <OffloadAPI.h>
+
+#include "common.hpp"
+
+struct ur_event_handle_t_ : RefCounted {
+ ol_event_handle_t OffloadEvent; // liboffload event wrapped by this handle
+};
diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp
new file mode 100644
index 0000000000000..6ab95aa6640da
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/kernel.cpp
@@ -0,0 +1,70 @@
+#include "kernel.hpp"
+#include "program.hpp"
+#include "ur2offload.hpp"
+#include <OffloadAPI.h>
+#include <ur/ur.hpp>
+#include <ur_api.h>
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelCreate(ur_program_handle_t hProgram, const char *pKernelName,
+               ur_kernel_handle_t *phKernel) {
+  // Allocate the wrapper first so that liboffload can write the native kernel
+  // handle straight into it.
+  auto *Wrapper = new ur_kernel_handle_t_;
+  auto Lookup = olGetKernel(hProgram->OffloadProgram, pKernelName,
+                            &Wrapper->OffloadKernel);
+  if (Lookup == OL_SUCCESS) {
+    *phKernel = Wrapper;
+    return UR_RESULT_SUCCESS;
+  }
+
+  // The lookup failed: discard the unused wrapper and report the error.
+  delete Wrapper;
+  return offloadResultToUR(Lookup);
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urKernelRetain(ur_kernel_handle_t hKernel) {
+  hKernel->RefCount.fetch_add(1); // one more owner of this kernel
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelRelease(ur_kernel_handle_t hKernel) {
+  if (hKernel->RefCount.fetch_sub(1) == 1) { // this was the last reference
+    delete hKernel;
+  }
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelSetExecInfo(ur_kernel_handle_t, ur_kernel_exec_info_t, size_t,
+                    const ur_kernel_exec_info_properties_t *, const void *) {
+  return UR_RESULT_SUCCESS; // every exec-info request is accepted and ignored
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgPointer(
+    ur_kernel_handle_t hKernel, uint32_t argIndex,
+    const ur_kernel_arg_pointer_properties_t *, const void *pArgValue) {
+  // The pointer value itself (sizeof a pointer) is stored, not its pointee.
+  hKernel->Args.addArg(argIndex, sizeof(pArgValue), &pArgValue);
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
+    ur_kernel_handle_t hKernel, uint32_t argIndex, size_t argSize,
+    const ur_kernel_arg_value_properties_t *, const void *pArgValue) {
+  // argSize bytes read from pArgValue become argument number argIndex.
+  hKernel->Args.addArg(argIndex, argSize, pArgValue);
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urKernelGetGroupInfo(ur_kernel_handle_t, ur_device_handle_t,
+                     ur_kernel_group_info_t propName, size_t propSize,
+                     void *pPropValue, size_t *pPropSizeRet) {
+  UrReturnHelper Reply(propSize, pPropValue, pPropSizeRet);
+  // Only the compile-time work-group size is answered, always as {0, 0, 0}.
+  if (propName != UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE) {
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  }
+  size_t NoFixedSize[3] = {0, 0, 0};
+  return Reply(NoFixedSize, 3);
+}
diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp
new file mode 100644
index 0000000000000..dee293aaa1b44
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/kernel.hpp
@@ -0,0 +1,52 @@
+#pragma once
+
+#include <OffloadAPI.h>
+#include <array>
+#include <cstring>
+#include <numeric>
+#include <ur_api.h>
+#include <vector>
+
+#include "common.hpp"
+
+struct ur_kernel_handle_t_ : RefCounted {
+  // Kernel arguments packed back to back, in index order, into one buffer.
+  // Simplified version of the CUDA adapter's argument implementation
+  struct OffloadKernelArguments {
+    static constexpr size_t MaxParamBytes = 4096u;
+    using args_t = std::array<char, MaxParamBytes>;
+    using args_size_t = std::vector<size_t>;
+    using args_ptr_t = std::vector<void *>;
+    args_t Storage;
+    size_t StorageUsed = 0;
+    args_size_t ParamSizes;
+    args_ptr_t Pointers;
+
+    // Add an argument. If it already exists, it is replaced. Gaps are filled
+    // with empty arguments.
+    // NOTE(review): the copy is not checked against MaxParamBytes and later
+    // arguments are not moved when an earlier one changes size - confirm.
+    void addArg(size_t Index, size_t Size, const void *Arg) {
+      if (Index + 1 > Pointers.size()) {
+        Pointers.resize(Index + 1);
+        ParamSizes.resize(Index + 1);
+      }
+      ParamSizes[Index] = Size;
+      // Offset = total size of the arguments before Index, summed as size_t.
+      size_t InsertPos = std::accumulate(
+          std::begin(ParamSizes), std::begin(ParamSizes) + Index, size_t{0});
+      // Update the stored value for the argument.
+      std::memcpy(&Storage[InsertPos], Arg, Size);
+      Pointers[Index] = &Storage[InsertPos];
+    }
+
+    const args_ptr_t &getPointers() const noexcept { return Pointers; }
+    const char *getStorage() const noexcept { return Storage.data(); }
+    size_t getStorageSize() const noexcept {
+      return std::accumulate(std::begin(ParamSizes), std::end(ParamSizes),
+                             size_t{0});
+    }
+  };
+
+  ol_kernel_handle_t OffloadKernel;
+  OffloadKernelArguments Args{};
+};
diff --git a/unified-runtime/source/adapters/offload/platform.cpp b/unified-runtime/source/adapters/offload/platform.cpp
new file mode 100644
index 0000000000000..50b9ac90649ce
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/platform.cpp
@@ -0,0 +1,108 @@
+#include <OffloadAPI.h>
+#include <unordered_set>
+#include <ur/ur.hpp>
+#include <ur_api.h>
+
+#include "adapter.hpp"
+#include "ur2offload.hpp"
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urPlatformGet(ur_adapter_handle_t, uint32_t NumEntries,
+              ur_platform_handle_t *phPlatforms, uint32_t *pNumPlatforms) {
+  // Reports the count and/or up to NumEntries of the adapter's platforms.
+  if (pNumPlatforms) {
+    *pNumPlatforms = static_cast<uint32_t>(Adapter.Platforms.size());
+  }
+  if (phPlatforms) {
+    size_t PlatformIndex = 0;
+    for (auto &Platform : Adapter.Platforms) {
+      // Check the limit before writing so NumEntries == 0 writes nothing.
+      if (PlatformIndex == NumEntries) {
+        break;
+      }
+      phPlatforms[PlatformIndex++] =
+          reinterpret_cast<ur_platform_handle_t>(Platform);
+    }
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName,
+                  size_t propSize, void *pPropValue, size_t *pPropSizeRet) {
+  UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
+  // Properties either get a fixed answer here or are forwarded to liboffload.
+  ol_platform_info_t olInfo;
+  switch (propName) {
+  case UR_PLATFORM_INFO_NAME:
+    olInfo = OL_PLATFORM_INFO_NAME;
+    break;
+  case UR_PLATFORM_INFO_VENDOR_NAME:
+    olInfo = OL_PLATFORM_INFO_VENDOR_NAME;
+    break;
+  case UR_PLATFORM_INFO_VERSION:
+    olInfo = OL_PLATFORM_INFO_VERSION;
+    break;
+  case UR_PLATFORM_INFO_EXTENSIONS:
+    return ReturnValue(""); // fixed empty string
+  case UR_PLATFORM_INFO_PROFILE:
+    return ReturnValue("FULL_PROFILE");
+  case UR_PLATFORM_INFO_BACKEND:
+    olInfo = OL_PLATFORM_INFO_BACKEND; // converted to the UR enum further down
+    break;
+  default:
+    return UR_RESULT_ERROR_INVALID_ENUMERATION;
+  }
+
+  if (pPropSizeRet) { // the caller asked for the size of the property
+    if (auto Res = olGetPlatformInfoSize(
+            reinterpret_cast<ol_platform_handle_t>(hPlatform), olInfo,
+            pPropSizeRet)) {
+      return offloadResultToUR(Res);
+    }
+  }
+
+  if (pPropValue) { // the caller asked for the value of the property
+    if (auto Res =
+            olGetPlatformInfo(reinterpret_cast<ol_platform_handle_t>(hPlatform),
+                              olInfo, propSize, pPropValue)) {
+      return offloadResultToUR(Res);
+    }
+
+    // Need to explicitly map this type
+    if (olInfo == OL_PLATFORM_INFO_BACKEND) {
+      // Both pointers alias the caller's buffer: the value is rewritten there.
+      auto urPropPtr = reinterpret_cast<ur_platform_backend_t *>(pPropValue);
+      auto olPropPtr = reinterpret_cast<ol_platform_backend_t *>(pPropValue);
+
+      switch (*olPropPtr) {
+      case OL_PLATFORM_BACKEND_CUDA:
+        *urPropPtr = UR_PLATFORM_BACKEND_CUDA;
+        break;
+      case OL_PLATFORM_BACKEND_AMDGPU:
+        *urPropPtr = UR_PLATFORM_BACKEND_HIP;
+        break;
+      default:
+        *urPropPtr = UR_PLATFORM_BACKEND_UNKNOWN;
+        break;
+      }
+    }
+  }
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urPlatformGetBackendOption(ur_platform_handle_t, const char *pFrontendOption,
+                           const char **ppPlatformOption) {
+  if (!pFrontendOption)
+    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+  // The optimisation levels and the empty option all map to an empty option.
+  for (std::string_view Known : {"-O0", "-O1", "-O2", "-O3", ""}) {
+    if (Known == pFrontendOption) {
+      *ppPlatformOption = "";
+      return UR_RESULT_SUCCESS;
+    }
+  }
+  return UR_RESULT_ERROR_INVALID_VALUE;
+}
diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp
new file mode 100644
index 0000000000000..a55644efdd4f7
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/program.cpp
@@ -0,0 +1,101 @@
+#include <OffloadAPI.h>
+#include <ur/ur.hpp>
+#include <ur_api.h>
+#include <cuda.h>
+
+#include "context.hpp"
+#include "program.hpp"
+#include "ur2offload.hpp"
+
+UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
+    ur_context_handle_t hContext, uint32_t numDevices,
+    ur_device_handle_t *phDevices, size_t *pLengths, const uint8_t **ppBinaries,
+    const ur_program_properties_t *, ur_program_handle_t *phProgram) {
+  if (numDevices > 1) {
+    return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+  }
+
+  // Workaround for Offload not supporting PTX binaries. Force CUDA programs
+  // to be linked so they end up as CUBIN.
+  uint8_t *RealBinary;
+  size_t RealLength;
+  ur_platform_handle_t DevicePlatform;
+  bool DidLink = false;
+  CUlinkState State;
+  urDeviceGetInfo(phDevices[0], UR_DEVICE_INFO_PLATFORM,
+                  sizeof(ur_platform_handle_t), &DevicePlatform, nullptr);
+  ur_platform_backend_t PlatformBackend;
+  urPlatformGetInfo(DevicePlatform, UR_PLATFORM_INFO_BACKEND,
+                    sizeof(ur_platform_backend_t), &PlatformBackend, nullptr);
+  if (PlatformBackend == UR_PLATFORM_BACKEND_CUDA) {
+    cuLinkCreate(0, nullptr, nullptr, &State);
+
+    cuLinkAddData(State, CU_JIT_INPUT_PTX, (char *)(ppBinaries[0]), pLengths[0],
+                  nullptr, 0, nullptr, nullptr);
+
+    void *CuBin = nullptr;
+    size_t CuBinSize = 0;
+    cuLinkComplete(State, &CuBin, &CuBinSize);
+    RealBinary = (uint8_t *)CuBin;
+    RealLength = CuBinSize;
+    DidLink = true;
+    fprintf(stderr, "Performed CUDA bin workaround (size = %lu)\n", RealLength);
+  } else {
+    RealBinary = const_cast<uint8_t *>(ppBinaries[0]);
+    RealLength = pLengths[0];
+  }
+
+  ur_program_handle_t Program = new ur_program_handle_t_();
+  auto Res =
+      olCreateProgram(reinterpret_cast<ol_device_handle_t>(hContext->Device),
+                      RealBinary, RealLength, &Program->OffloadProgram);
+
+  // Program owns the linked module now
+  if (DidLink) {
+    cuLinkDestroy(State);
+  }
+
+  if (Res != OL_SUCCESS) {
+    delete Program;
+    return offloadResultToUR(Res);
+  }
+
+  *phProgram = Program;
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urProgramBuild(ur_context_handle_t,
+                                                   ur_program_handle_t,
+                                                   const char *) {
+  // Do nothing, program is built upon creation
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(ur_program_handle_t,
+                                                      uint32_t,
+                                                      ur_device_handle_t *,
+                                                      const char *) {
+  // Do nothing, program is built upon creation
+  return UR_RESULT_SUCCESS;
+}
+
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urProgramRetain(ur_program_handle_t hProgram) {
+  hProgram->RefCount++;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL
+urProgramRelease(ur_program_handle_t hProgram) {
+  if (--hProgram->RefCount != 0) {
+    return UR_RESULT_SUCCESS;
+  }
+
+  // Last reference dropped. Free the wrapper even when Offload reports an
+  // error: returning early would leak it with its count already at zero.
+  auto Res = olDestroyProgram(hProgram->OffloadProgram);
+  delete hProgram;
+  return offloadResultToUR(Res);
+}
diff --git a/unified-runtime/source/adapters/offload/program.hpp b/unified-runtime/source/adapters/offload/program.hpp
new file mode 100644
index 0000000000000..0639ab336c5fb
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/program.hpp
@@ -0,0 +1,10 @@
+#pragma once
+
+#include <ur_api.h>
+#include <OffloadAPI.h>
+
+#include "common.hpp"
+
+struct ur_program_handle_t_ : RefCounted {
+ ol_program_handle_t OffloadProgram;
+};
diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp
new file mode 100644
index 0000000000000..2900deab68c60
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/queue.cpp
@@ -0,0 +1,49 @@
+#include <OffloadAPI.h>
+#include <ur/ur.hpp>
+#include <ur_api.h>
+
+#include "context.hpp"
+#include "queue.hpp"
+#include "ur2offload.hpp"
+
+UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
+    [[maybe_unused]] ur_context_handle_t hContext, ur_device_handle_t hDevice,
+    const ur_queue_properties_t *, ur_queue_handle_t *phQueue) {
+
+  assert(hContext->Device == hDevice);
+
+  ur_queue_handle_t Queue = new ur_queue_handle_t_();
+  auto Res = olCreateQueue(reinterpret_cast<ol_device_handle_t>(hDevice),
+                           &Queue->OffloadQueue);
+  if (Res != OL_SUCCESS) {
+    delete Queue;
+    return offloadResultToUR(Res);
+  }
+
+  Queue->OffloadDevice = reinterpret_cast<ol_device_handle_t>(hDevice);
+
+  *phQueue = Queue;
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) {
+  hQueue->RefCount++;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urQueueRelease(ur_queue_handle_t hQueue) {
+  if (--hQueue->RefCount != 0) {
+    return UR_RESULT_SUCCESS;
+  }
+
+  // Last reference dropped. Free the wrapper even when Offload reports an
+  // error: returning early would leak it with its count already at zero.
+  auto Res = olDestroyQueue(hQueue->OffloadQueue);
+  delete hQueue;
+  return offloadResultToUR(Res);
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urQueueFinish(ur_queue_handle_t hQueue) {
+  return offloadResultToUR(olWaitQueue(hQueue->OffloadQueue));
+}
diff --git a/unified-runtime/source/adapters/offload/queue.hpp b/unified-runtime/source/adapters/offload/queue.hpp
new file mode 100644
index 0000000000000..9406d460b7401
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/queue.hpp
@@ -0,0 +1,11 @@
+#pragma once
+
+#include <ur_api.h>
+#include <OffloadAPI.h>
+
+#include "common.hpp"
+
+struct ur_queue_handle_t_ : RefCounted {
+ ol_queue_handle_t OffloadQueue;
+ ol_device_handle_t OffloadDevice;
+};
diff --git a/unified-runtime/source/adapters/offload/ur2offload.hpp b/unified-runtime/source/adapters/offload/ur2offload.hpp
new file mode 100644
index 0000000000000..2e9835bc480d0
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/ur2offload.hpp
@@ -0,0 +1,29 @@
+//===--------- ur2offload.hpp - LLVM Offload Adapter ----------------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+#pragma once
+
+#include <OffloadAPI.h>
+#include <ur_api.h>
+
+// Translates an Offload result into the closest matching UR result code.
+inline ur_result_t offloadResultToUR(ol_result_t Result) {
+  if (Result == OL_SUCCESS)
+    return UR_RESULT_SUCCESS;
+  switch (Result->Code) {
+  case OL_ERRC_INVALID_NULL_HANDLE:
+    return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
+  case OL_ERRC_INVALID_NULL_POINTER:
+    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+  case OL_ERRC_UNSUPPORTED_ENUMERATION:
+    return UR_RESULT_ERROR_UNSUPPORTED_ENUMERATION;
+  default:
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+}
diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
new file mode 100644
index 0000000000000..789bd653ea4bb
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp
@@ -0,0 +1,412 @@
+//===----------- ur_interface_loader.cpp - LLVM Offload Plugin  -----------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <ur_api.h>
+#include <ur_ddi.h>
+
+namespace {
+
+// TODO - this is a duplicate of what is in the L0 plugin
+// We should move this to somewhere common
+ur_result_t validateProcInputs(ur_api_version_t version, void *pDdiTable) {
+  if (pDdiTable == nullptr) {
+    return UR_RESULT_ERROR_INVALID_NULL_POINTER;
+  }
+  // Pre 1.0 we enforce that loader and adapter must have the same version.
+  // Post 1.0 only a major version match should be required.
+  if (version != UR_API_VERSION_CURRENT) {
+    return UR_RESULT_ERROR_UNSUPPORTED_VERSION;
+  }
+  return UR_RESULT_SUCCESS;
+}
+} // namespace
+
+extern "C" {
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetPlatformProcAddrTable(
+    ur_api_version_t version, ur_platform_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnCreateWithNativeHandle = nullptr;
+  pDdiTable->pfnGet = urPlatformGet;
+  pDdiTable->pfnGetApiVersion = nullptr;
+  pDdiTable->pfnGetInfo = urPlatformGetInfo;
+  pDdiTable->pfnGetNativeHandle = nullptr;
+  pDdiTable->pfnGetBackendOption = urPlatformGetBackendOption;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetContextProcAddrTable(
+    ur_api_version_t version, ur_context_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnCreate = urContextCreate;
+  pDdiTable->pfnCreateWithNativeHandle = nullptr;
+  pDdiTable->pfnGetInfo = nullptr;
+  pDdiTable->pfnGetNativeHandle = nullptr;
+  pDdiTable->pfnRelease = urContextRelease;
+  pDdiTable->pfnRetain = urContextRetain;
+  pDdiTable->pfnSetExtendedDeleter = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetEventProcAddrTable(
+    ur_api_version_t version, ur_event_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnCreateWithNativeHandle = nullptr;
+  pDdiTable->pfnGetInfo = nullptr;
+  pDdiTable->pfnGetNativeHandle = nullptr;
+  pDdiTable->pfnGetProfilingInfo = nullptr;
+  pDdiTable->pfnRelease = urEventRelease;
+  pDdiTable->pfnRetain = urEventRetain;
+  pDdiTable->pfnSetCallback = nullptr;
+  pDdiTable->pfnWait = urEventWait;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramProcAddrTable(
+    ur_api_version_t version, ur_program_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnBuild = urProgramBuild;
+  pDdiTable->pfnCompile = nullptr;
+  pDdiTable->pfnCreateWithBinary = urProgramCreateWithBinary;
+  pDdiTable->pfnCreateWithIL = nullptr;
+  pDdiTable->pfnCreateWithNativeHandle = nullptr;
+  pDdiTable->pfnGetBuildInfo = nullptr;
+  pDdiTable->pfnGetFunctionPointer = nullptr;
+  pDdiTable->pfnGetGlobalVariablePointer = nullptr;
+  pDdiTable->pfnGetInfo = nullptr;
+  pDdiTable->pfnGetNativeHandle = nullptr;
+  pDdiTable->pfnLink = nullptr;
+  pDdiTable->pfnRelease = urProgramRelease;
+  pDdiTable->pfnRetain = urProgramRetain;
+  pDdiTable->pfnSetSpecializationConstants = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelProcAddrTable(
+    ur_api_version_t version, ur_kernel_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnCreate = urKernelCreate;
+  pDdiTable->pfnCreateWithNativeHandle = nullptr;
+  pDdiTable->pfnGetGroupInfo = urKernelGetGroupInfo;
+  pDdiTable->pfnGetInfo = nullptr;
+  pDdiTable->pfnGetNativeHandle = nullptr;
+  pDdiTable->pfnGetSubGroupInfo = nullptr;
+  pDdiTable->pfnRelease = urKernelRelease;
+  pDdiTable->pfnRetain = urKernelRetain;
+  pDdiTable->pfnSetArgLocal = nullptr;
+  pDdiTable->pfnSetArgMemObj = nullptr;
+  pDdiTable->pfnSetArgPointer = urKernelSetArgPointer;
+  pDdiTable->pfnSetArgSampler = nullptr;
+  pDdiTable->pfnSetArgValue = urKernelSetArgValue;
+  pDdiTable->pfnSetExecInfo = urKernelSetExecInfo;
+  pDdiTable->pfnSetSpecializationConstants = nullptr;
+  pDdiTable->pfnGetSuggestedLocalWorkSize = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetSamplerProcAddrTable(
+    ur_api_version_t version, ur_sampler_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnCreate = nullptr;
+  pDdiTable->pfnCreateWithNativeHandle = nullptr;
+  pDdiTable->pfnGetInfo = nullptr;
+  pDdiTable->pfnGetNativeHandle = nullptr;
+  pDdiTable->pfnRelease = nullptr;
+  pDdiTable->pfnRetain = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL
+urGetMemProcAddrTable(ur_api_version_t version, ur_mem_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnBufferCreate = nullptr;
+  pDdiTable->pfnBufferPartition = nullptr;
+  pDdiTable->pfnBufferCreateWithNativeHandle = nullptr;
+  pDdiTable->pfnImageCreateWithNativeHandle = nullptr;
+  pDdiTable->pfnGetInfo = nullptr;
+  pDdiTable->pfnGetNativeHandle = nullptr;
+  pDdiTable->pfnImageCreate = nullptr;
+  pDdiTable->pfnImageGetInfo = nullptr;
+  pDdiTable->pfnRelease = nullptr;
+  pDdiTable->pfnRetain = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueProcAddrTable(
+    ur_api_version_t version, ur_enqueue_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnDeviceGlobalVariableRead = nullptr;
+  pDdiTable->pfnDeviceGlobalVariableWrite = nullptr;
+  pDdiTable->pfnEventsWait = nullptr;
+  pDdiTable->pfnEventsWaitWithBarrier = nullptr;
+  pDdiTable->pfnKernelLaunch = urEnqueueKernelLaunch;
+  pDdiTable->pfnMemBufferCopy = nullptr;
+  pDdiTable->pfnMemBufferCopyRect = nullptr;
+  pDdiTable->pfnMemBufferFill = nullptr;
+  pDdiTable->pfnMemBufferMap = nullptr;
+  pDdiTable->pfnMemBufferRead = nullptr;
+  pDdiTable->pfnMemBufferReadRect = nullptr;
+  pDdiTable->pfnMemBufferWrite = nullptr;
+  pDdiTable->pfnMemBufferWriteRect = nullptr;
+  pDdiTable->pfnMemImageCopy = nullptr;
+  pDdiTable->pfnMemImageRead = nullptr;
+  pDdiTable->pfnMemImageWrite = nullptr;
+  pDdiTable->pfnMemUnmap = nullptr;
+  pDdiTable->pfnUSMFill2D = nullptr;
+  pDdiTable->pfnUSMFill = nullptr;
+  pDdiTable->pfnUSMAdvise = nullptr;
+  pDdiTable->pfnUSMMemcpy2D = nullptr;
+  pDdiTable->pfnUSMMemcpy = nullptr;
+  pDdiTable->pfnUSMPrefetch = nullptr;
+  pDdiTable->pfnReadHostPipe = nullptr;
+  pDdiTable->pfnWriteHostPipe = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetGlobalProcAddrTable(
+    ur_api_version_t version, ur_global_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnAdapterGet = urAdapterGet;
+  pDdiTable->pfnAdapterGetInfo = urAdapterGetInfo;
+  pDdiTable->pfnAdapterRelease = urAdapterRelease;
+  pDdiTable->pfnAdapterRetain = urAdapterRetain;
+  pDdiTable->pfnAdapterGetLastError = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable(
+    ur_api_version_t version, ur_queue_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnCreate = urQueueCreate;
+  pDdiTable->pfnCreateWithNativeHandle = nullptr;
+  pDdiTable->pfnFinish = urQueueFinish;
+  pDdiTable->pfnFlush = nullptr;
+  pDdiTable->pfnGetInfo = nullptr;
+  pDdiTable->pfnGetNativeHandle = nullptr;
+  pDdiTable->pfnRelease = urQueueRelease;
+  pDdiTable->pfnRetain = urQueueRetain;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL
+urGetUSMProcAddrTable(ur_api_version_t version, ur_usm_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnDeviceAlloc = urUSMDeviceAlloc;
+  pDdiTable->pfnFree = urUSMFree;
+  pDdiTable->pfnGetMemAllocInfo = nullptr;
+  pDdiTable->pfnHostAlloc = urUSMHostAlloc;
+  pDdiTable->pfnPoolCreate = nullptr;
+  pDdiTable->pfnPoolRetain = nullptr;
+  pDdiTable->pfnPoolRelease = nullptr;
+  pDdiTable->pfnPoolGetInfo = nullptr;
+  pDdiTable->pfnSharedAlloc = urUSMSharedAlloc;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetDeviceProcAddrTable(
+    ur_api_version_t version, ur_device_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnCreateWithNativeHandle = nullptr;
+  pDdiTable->pfnGet = urDeviceGet;
+  pDdiTable->pfnGetGlobalTimestamps = nullptr;
+  pDdiTable->pfnGetInfo = urDeviceGetInfo;
+  pDdiTable->pfnGetNativeHandle = nullptr;
+  pDdiTable->pfnPartition = urDevicePartition;
+  pDdiTable->pfnRelease = urDeviceRelease;
+  pDdiTable->pfnRetain = urDeviceRetain;
+  pDdiTable->pfnSelectBinary = urDeviceSelectBinary;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetCommandBufferExpProcAddrTable(
+    ur_api_version_t version, ur_command_buffer_exp_dditable_t *pDdiTable) {
+  auto retVal = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != retVal) {
+    return retVal;
+  }
+  pDdiTable->pfnCreateExp = nullptr;
+  pDdiTable->pfnRetainExp = nullptr;
+  pDdiTable->pfnReleaseExp = nullptr;
+  pDdiTable->pfnFinalizeExp = nullptr;
+  pDdiTable->pfnAppendKernelLaunchExp = nullptr;
+  pDdiTable->pfnAppendUSMMemcpyExp = nullptr;
+  pDdiTable->pfnAppendMemBufferCopyExp = nullptr;
+  pDdiTable->pfnAppendMemBufferCopyRectExp = nullptr;
+  pDdiTable->pfnAppendMemBufferReadExp = nullptr;
+  pDdiTable->pfnAppendMemBufferReadRectExp = nullptr;
+  pDdiTable->pfnAppendMemBufferWriteExp = nullptr;
+  pDdiTable->pfnAppendMemBufferWriteRectExp = nullptr;
+  pDdiTable->pfnUpdateKernelLaunchExp = nullptr;
+  pDdiTable->pfnGetInfoExp = nullptr;
+
+  return retVal;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetUsmP2PExpProcAddrTable(
+    ur_api_version_t version, ur_usm_p2p_exp_dditable_t *pDdiTable) {
+  auto retVal = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != retVal) {
+    return retVal;
+  }
+  pDdiTable->pfnEnablePeerAccessExp = nullptr;
+  pDdiTable->pfnDisablePeerAccessExp = nullptr;
+  pDdiTable->pfnPeerAccessGetInfoExp = nullptr;
+
+  return retVal;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetBindlessImagesExpProcAddrTable(
+    ur_api_version_t version, ur_bindless_images_exp_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnUnsampledImageHandleDestroyExp = nullptr;
+  pDdiTable->pfnSampledImageHandleDestroyExp = nullptr;
+  pDdiTable->pfnImageAllocateExp = nullptr;
+  pDdiTable->pfnImageFreeExp = nullptr;
+  pDdiTable->pfnUnsampledImageCreateExp = nullptr;
+  pDdiTable->pfnSampledImageCreateExp = nullptr;
+  pDdiTable->pfnImageCopyExp = nullptr;
+  pDdiTable->pfnImageGetInfoExp = nullptr;
+  pDdiTable->pfnMipmapGetLevelExp = nullptr;
+  pDdiTable->pfnMipmapFreeExp = nullptr;
+  pDdiTable->pfnImportExternalMemoryExp = nullptr;
+  pDdiTable->pfnMapExternalArrayExp = nullptr;
+  pDdiTable->pfnMapExternalLinearMemoryExp = nullptr;
+  pDdiTable->pfnReleaseExternalMemoryExp = nullptr;
+  pDdiTable->pfnImportExternalSemaphoreExp = nullptr;
+  pDdiTable->pfnReleaseExternalSemaphoreExp = nullptr;
+  pDdiTable->pfnWaitExternalSemaphoreExp = nullptr;
+  pDdiTable->pfnSignalExternalSemaphoreExp = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetPhysicalMemProcAddrTable(
+    ur_api_version_t version, ur_physical_mem_dditable_t *pDdiTable) {
+  auto retVal = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != retVal) {
+    return retVal;
+  }
+
+  pDdiTable->pfnCreate = nullptr;
+  pDdiTable->pfnRelease = nullptr;
+  pDdiTable->pfnRetain = nullptr;
+
+  return retVal;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetUSMExpProcAddrTable(
+    ur_api_version_t version, ur_usm_exp_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+  pDdiTable->pfnPitchedAllocExp = nullptr;
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetVirtualMemProcAddrTable(
+    ur_api_version_t version, ///< [in] API version requested
+    ur_virtual_mem_dditable_t
+        *pDdiTable ///< [in,out] pointer to table of DDI function pointers
+) {
+  auto retVal = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != retVal) {
+    return retVal;
+  }
+
+  pDdiTable->pfnFree = nullptr;
+  pDdiTable->pfnGetInfo = nullptr;
+  pDdiTable->pfnGranularityGetInfo = nullptr;
+  pDdiTable->pfnMap = nullptr;
+  pDdiTable->pfnReserve = nullptr;
+  pDdiTable->pfnSetAccess = nullptr;
+  pDdiTable->pfnUnmap = nullptr;
+
+  return retVal;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetEnqueueExpProcAddrTable(
+    ur_api_version_t version, ur_enqueue_exp_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+
+  pDdiTable->pfnCooperativeKernelLaunchExp = nullptr;
+  pDdiTable->pfnTimestampRecordingExp = nullptr;
+  pDdiTable->pfnNativeCommandExp = nullptr;
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetKernelExpProcAddrTable(
+    ur_api_version_t version, ur_kernel_exp_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+
+  pDdiTable->pfnSuggestMaxCooperativeGroupCountExp = nullptr;
+
+  return UR_RESULT_SUCCESS;
+}
+
+UR_DLLEXPORT ur_result_t UR_APICALL urGetProgramExpProcAddrTable(
+    ur_api_version_t version, ur_program_exp_dditable_t *pDdiTable) {
+  auto result = validateProcInputs(version, pDdiTable);
+  if (UR_RESULT_SUCCESS != result) {
+    return result;
+  }
+
+  pDdiTable->pfnBuildExp = urProgramBuildExp;
+  pDdiTable->pfnCompileExp = nullptr;
+  pDdiTable->pfnLinkExp = nullptr;
+
+  return UR_RESULT_SUCCESS;
+}
+} // extern "C"
diff --git a/unified-runtime/source/adapters/offload/usm.cpp b/unified-runtime/source/adapters/offload/usm.cpp
new file mode 100644
index 0000000000000..a597cf87c21b1
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/usm.cpp
@@ -0,0 +1,53 @@
+#include <OffloadAPI.h>
+#include <ur/ur.hpp>
+#include <ur_api.h>
+
+#include "context.hpp"
+#include "ur2offload.hpp"
+
+namespace {
+// Allocates size bytes of the requested Offload allocation type on the
+// context's device and, on success, records that type for the new pointer.
+ur_result_t allocAndTrack(ur_context_handle_t hContext, ol_alloc_type_t Type,
+                          size_t size, void **ppMem) {
+  auto Res = olMemAlloc(reinterpret_cast<ol_device_handle_t>(hContext->Device),
+                        Type, size, ppMem);
+  if (Res != OL_SUCCESS) {
+    return offloadResultToUR(Res);
+  }
+
+  hContext->AllocTypeMap.insert_or_assign(*ppMem, Type);
+  return UR_RESULT_SUCCESS;
+}
+} // namespace
+
+UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext,
+                                                   const ur_usm_desc_t *,
+                                                   ur_usm_pool_handle_t,
+                                                   size_t size, void **ppMem) {
+  return allocAndTrack(hContext, OL_ALLOC_TYPE_HOST, size, ppMem);
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc(
+    ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *,
+    ur_usm_pool_handle_t, size_t size, void **ppMem) {
+  return allocAndTrack(hContext, OL_ALLOC_TYPE_DEVICE, size, ppMem);
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc(
+    ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *,
+    ur_usm_pool_handle_t, size_t size, void **ppMem) {
+  return allocAndTrack(hContext, OL_ALLOC_TYPE_MANAGED, size, ppMem);
+}
+
+UR_APIEXPORT ur_result_t UR_APICALL urUSMFree(ur_context_handle_t hContext,
+                                              void *pMem) {
+  auto Res = olMemFree(pMem);
+  if (Res != OL_SUCCESS) {
+    return offloadResultToUR(Res);
+  }
+
+  // Drop the bookkeeping entry so a freed address never reports a stale type.
+  hContext->AllocTypeMap.erase(pMem);
+  return UR_RESULT_SUCCESS;
+}

From 29e361f04fef8cb85ae426b4256f137a5cf0464d Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Thu, 24 Apr 2025 12:01:48 +0100
Subject: [PATCH 02/11] Support compiling on non-CUDA

This makes the dependency on cudadrv optional and ifdefs away the cubin
workaround when cudadrv is not available. This isn't sufficient to have HIP
devices compile kernels, but does allow libur_adapter_offload to be built on
hosts without CUDA.

In addition, an unused variable error was fixed.
---
 .../source/adapters/offload/CMakeLists.txt    | 10 ++-
 .../source/adapters/offload/adapter.cpp       |  2 +
 .../source/adapters/offload/program.cpp       | 86 ++++++++++++-------
 .../test/conformance/CMakeLists.txt           |  2 +-
 4 files changed, 68 insertions(+), 32 deletions(-)

diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt
index d3559c2ae7761..6461fabe647b2 100644
--- a/unified-runtime/source/adapters/offload/CMakeLists.txt
+++ b/unified-runtime/source/adapters/offload/CMakeLists.txt
@@ -14,7 +14,7 @@ endif()
 
 # For the PTX workaround we need to link with CUDA.
 if (NOT TARGET cudadrv)
-        find_package(CUDA 10.1 REQUIRED)
+        find_package(CUDA 10.1)
         add_library(cudadrv SHARED IMPORTED GLOBAL)
         set_target_properties(
                 cudadrv PROPERTIES 
@@ -49,9 +49,15 @@ target_link_libraries(${TARGET_NAME} PRIVATE
         ${PROJECT_NAME}::common
         ${PROJECT_NAME}::umf
         ${UR_OFFLOAD_INSTALL_DIR}/lib/libLLVMOffload.so
-        cudadrv
 )
 
+if (CUDA_CUDA_LIBRARY)
+    target_link_libraries(${TARGET_NAME}
+        cudadrv
+    )
+    target_compile_definitions(${TARGET_NAME} PRIVATE UR_CUDA_ENABLED=1)
+endif()
+
 target_include_directories(${TARGET_NAME} PRIVATE
         "${UR_OFFLOAD_INCLUDE_DIR}/offload"
         "${CMAKE_CURRENT_SOURCE_DIR}/../../"
diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp
index bde5dc8e8e6a5..6c4962a69e748 100644
--- a/unified-runtime/source/adapters/offload/adapter.cpp
+++ b/unified-runtime/source/adapters/offload/adapter.cpp
@@ -46,6 +46,8 @@ ur_result_t ur_adapter_handle_t_::init() {
       },
       this);
 
+  (void)Res;
+
   return UR_RESULT_SUCCESS;
 }
 
diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp
index a55644efdd4f7..8067a5d17056a 100644
--- a/unified-runtime/source/adapters/offload/program.cpp
+++ b/unified-runtime/source/adapters/offload/program.cpp
@@ -1,12 +1,63 @@
 #include <OffloadAPI.h>
 #include <ur/ur.hpp>
 #include <ur_api.h>
-#include <cuda.h>
 
 #include "context.hpp"
 #include "program.hpp"
 #include "ur2offload.hpp"
 
+#ifdef UR_CUDA_ENABLED
+#include <cuda.h>
+#endif
+
+namespace {
+// Workaround for Offload not supporting PTX binaries. Force CUDA programs
+// to be linked so they end up as CUBIN.
+#ifdef UR_CUDA_ENABLED
+// Workaround body: JIT-link the PTX image to a CUBIN with the CUDA driver and
+// create the Offload program from it, reporting CUDA failures to the caller.
+ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t hContext,
+                                        const uint8_t *Binary, size_t Length,
+                                        ur_program_handle_t *phProgram) {
+  CUlinkState State;
+  if (cuLinkCreate(0, nullptr, nullptr, &State) != CUDA_SUCCESS) {
+    return UR_RESULT_ERROR_UNKNOWN;
+  }
+
+  void *CuBin = nullptr;
+  size_t CuBinSize = 0;
+  if (cuLinkAddData(State, CU_JIT_INPUT_PTX, const_cast<uint8_t *>(Binary),
+                    Length, nullptr, 0, nullptr, nullptr) != CUDA_SUCCESS ||
+      cuLinkComplete(State, &CuBin, &CuBinSize) != CUDA_SUCCESS) {
+    cuLinkDestroy(State);
+    return UR_RESULT_ERROR_INVALID_BINARY;
+  }
+  fprintf(stderr, "Performed CUDA bin workaround (size = %zu)\n", CuBinSize);
+
+  ur_program_handle_t Program = new ur_program_handle_t_();
+  auto Res =
+      olCreateProgram(reinterpret_cast<ol_device_handle_t>(hContext->Device),
+                      static_cast<uint8_t *>(CuBin), CuBinSize,
+                      &Program->OffloadProgram);
+
+  // Program owns the linked module now
+  cuLinkDestroy(State);
+
+  if (Res != OL_SUCCESS) {
+    delete Program;
+    return offloadResultToUR(Res);
+  }
+  *phProgram = Program;
+  return UR_RESULT_SUCCESS;
+}
+#else
+ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t, const uint8_t *,
+                                        size_t, ur_program_handle_t *) {
+  return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
+}
+#endif
+} // namespace
+
 UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
     ur_context_handle_t hContext, uint32_t numDevices,
     ur_device_handle_t *phDevices, size_t *pLengths, const uint8_t **ppBinaries,
@@ -15,45 +66,23 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
     return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
   }
 
-  // Workaround for Offload not supporting PTX binaries. Force CUDA programs
-  // to be linked so they end up as CUBIN.
-  uint8_t *RealBinary;
-  size_t RealLength;
   ur_platform_handle_t DevicePlatform;
-  bool DidLink = false;
-  CUlinkState State;
   urDeviceGetInfo(phDevices[0], UR_DEVICE_INFO_PLATFORM,
                   sizeof(ur_platform_handle_t), &DevicePlatform, nullptr);
   ur_platform_backend_t PlatformBackend;
   urPlatformGetInfo(DevicePlatform, UR_PLATFORM_INFO_BACKEND,
                     sizeof(ur_platform_backend_t), &PlatformBackend, nullptr);
   if (PlatformBackend == UR_PLATFORM_BACKEND_CUDA) {
-    cuLinkCreate(0, nullptr, nullptr, &State);
-
-    cuLinkAddData(State, CU_JIT_INPUT_PTX, (char *)(ppBinaries[0]), pLengths[0],
-                  nullptr, 0, nullptr, nullptr);
-
-    void *CuBin = nullptr;
-    size_t CuBinSize = 0;
-    cuLinkComplete(State, &CuBin, &CuBinSize);
-    RealBinary = (uint8_t *)CuBin;
-    RealLength = CuBinSize;
-    DidLink = true;
-    fprintf(stderr, "Performed CUDA bin workaround (size = %lu)\n", RealLength);
-  } else {
-    RealBinary = const_cast<uint8_t *>(ppBinaries[0]);
-    RealLength = pLengths[0];
+    return ProgramCreateCudaWorkaround(hContext, ppBinaries[0], pLengths[0],
+                                       phProgram);
   }
 
+  auto *RealBinary = const_cast<uint8_t *>(ppBinaries[0]);
+
   ur_program_handle_t Program = new ur_program_handle_t_();
   auto Res =
       olCreateProgram(reinterpret_cast<ol_device_handle_t>(hContext->Device),
-                      RealBinary, RealLength, &Program->OffloadProgram);
-
-  // Program owns the linked module now
-  if (DidLink) {
-    cuLinkDestroy(State);
-  }
+                      RealBinary, pLengths[0], &Program->OffloadProgram);
 
   if (Res != OL_SUCCESS) {
     delete Program;
@@ -80,7 +109,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramBuildExp(ur_program_handle_t,
   return UR_RESULT_SUCCESS;
 }
 
-
 UR_APIEXPORT ur_result_t UR_APICALL
 urProgramRetain(ur_program_handle_t hProgram) {
   hProgram->RefCount++;
diff --git a/unified-runtime/test/conformance/CMakeLists.txt b/unified-runtime/test/conformance/CMakeLists.txt
index 9e29e727e2e2a..73f7f10aaabf6 100644
--- a/unified-runtime/test/conformance/CMakeLists.txt
+++ b/unified-runtime/test/conformance/CMakeLists.txt
@@ -131,7 +131,7 @@ if(UR_FOUND_DPCXX)
         if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL)
             list(APPEND TARGET_TRIPLES "nvptx64-nvidia-cuda")
         endif()
-        if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_ALL)
+        if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_ALL)
             list(APPEND TARGET_TRIPLES "amdgcn-amd-amdhsa")
         endif()
     else()

From fee5bd1ea7f48bd76006bb57de9442597d6a0910 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Thu, 24 Apr 2025 12:49:35 +0100
Subject: [PATCH 03/11] Add "offload" as an adapter to the various registries

This adds "offload" to several locations, meaning that:
* It will be initialised by the loader and iterated like other adapters.
* ONEAPI_DEVICE_SELECTOR="offload:*" works (note that this is an
  extension to the ONEAPI_DEVICE_SELECTOR format).
* Platforms and adapters now report themselves as "OFFLOAD" rather than
  "CUDA" or "HIP".
---
 unified-runtime/include/ur_api.h              |  4 ++++
 unified-runtime/include/ur_print.hpp          |  6 ++++++
 unified-runtime/scripts/core/adapter.yml      |  3 +++
 unified-runtime/scripts/core/manifests.yml    |  7 +++++++
 unified-runtime/scripts/core/platform.yml     |  3 +++
 .../source/adapters/CMakeLists.txt            |  1 +
 .../source/adapters/offload/adapter.cpp       |  2 +-
 .../source/adapters/offload/platform.cpp      | 20 +------------------
 .../source/loader/ur_adapter_registry.hpp     |  1 +
 unified-runtime/source/loader/ur_lib.cpp      |  6 ++++--
 .../source/loader/ur_manifests.hpp            |  7 +++++++
 11 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/unified-runtime/include/ur_api.h b/unified-runtime/include/ur_api.h
index dcf05b2b066c7..36fe8ac06bf94 100644
--- a/unified-runtime/include/ur_api.h
+++ b/unified-runtime/include/ur_api.h
@@ -1422,6 +1422,8 @@ typedef enum ur_adapter_backend_t {
   UR_ADAPTER_BACKEND_HIP = 4,
   /// The backend is Native CPU
   UR_ADAPTER_BACKEND_NATIVE_CPU = 5,
+  /// The backend is liboffload
+  UR_ADAPTER_BACKEND_OFFLOAD = 0x100,
   /// @cond
   UR_ADAPTER_BACKEND_FORCE_UINT32 = 0x7fffffff
   /// @endcond
@@ -1801,6 +1803,8 @@ typedef enum ur_platform_backend_t {
   UR_PLATFORM_BACKEND_HIP = 4,
   /// The backend is Native CPU
   UR_PLATFORM_BACKEND_NATIVE_CPU = 5,
+  /// The backend is liboffload
+  UR_PLATFORM_BACKEND_OFFLOAD = 0x100,
   /// @cond
   UR_PLATFORM_BACKEND_FORCE_UINT32 = 0x7fffffff
   /// @endcond
diff --git a/unified-runtime/include/ur_print.hpp b/unified-runtime/include/ur_print.hpp
index c5333f76f478e..63656894ae0e9 100644
--- a/unified-runtime/include/ur_print.hpp
+++ b/unified-runtime/include/ur_print.hpp
@@ -2356,6 +2356,9 @@ inline std::ostream &operator<<(std::ostream &os,
   case UR_ADAPTER_BACKEND_NATIVE_CPU:
     os << "UR_ADAPTER_BACKEND_NATIVE_CPU";
     break;
+  case UR_ADAPTER_BACKEND_OFFLOAD:
+    os << "UR_ADAPTER_BACKEND_OFFLOAD";
+    break;
   default:
     os << "unknown enumerator";
     break;
@@ -2553,6 +2556,9 @@ inline std::ostream &operator<<(std::ostream &os,
   case UR_PLATFORM_BACKEND_NATIVE_CPU:
     os << "UR_PLATFORM_BACKEND_NATIVE_CPU";
     break;
+  case UR_PLATFORM_BACKEND_OFFLOAD:
+    os << "UR_PLATFORM_BACKEND_OFFLOAD";
+    break;
   default:
     os << "unknown enumerator";
     break;
diff --git a/unified-runtime/scripts/core/adapter.yml b/unified-runtime/scripts/core/adapter.yml
index d806d48974a4f..8253104386b05 100644
--- a/unified-runtime/scripts/core/adapter.yml
+++ b/unified-runtime/scripts/core/adapter.yml
@@ -209,6 +209,9 @@ etors:
     - name: NATIVE_CPU
       value: "5"
       desc: "The backend is Native CPU"
+    - name: OFFLOAD
+      value: "0x100"
+      desc: "The backend is liboffload"
 --- #--------------------------------------------------------------------------
 type: enum
 desc: "Minimum level of messages to be processed by the logger."
diff --git a/unified-runtime/scripts/core/manifests.yml b/unified-runtime/scripts/core/manifests.yml
index 6b9647852daea..da58ebb57df34 100644
--- a/unified-runtime/scripts/core/manifests.yml
+++ b/unified-runtime/scripts/core/manifests.yml
@@ -61,3 +61,10 @@ name: native_cpu
 backend: $X_ADAPTER_BACKEND_NATIVE_CPU
 device_types:
     - $X_DEVICE_TYPE_CPU
+--- #--------------------------------------------------------------------------
+type: manifest
+name: offload
+backend: $X_ADAPTER_BACKEND_OFFLOAD
+device_types:
+    - $X_DEVICE_TYPE_CPU
+    - $X_DEVICE_TYPE_GPU
diff --git a/unified-runtime/scripts/core/platform.yml b/unified-runtime/scripts/core/platform.yml
index 7d4edf5c0b5c0..84c7a99d6e833 100644
--- a/unified-runtime/scripts/core/platform.yml
+++ b/unified-runtime/scripts/core/platform.yml
@@ -279,3 +279,6 @@ etors:
     - name: NATIVE_CPU
       value: "5"
       desc: "The backend is Native CPU"
+    - name: OFFLOAD
+      value: "0x100"
+      desc: "The backend is liboffload"
diff --git a/unified-runtime/source/adapters/CMakeLists.txt b/unified-runtime/source/adapters/CMakeLists.txt
index 34ba19b6e8859..8c357caa21946 100644
--- a/unified-runtime/source/adapters/CMakeLists.txt
+++ b/unified-runtime/source/adapters/CMakeLists.txt
@@ -76,6 +76,7 @@ endif()
 
 if(UR_BUILD_ADAPTER_OFFLOAD)
     add_ur_adapter_subdirectory(offload)
+    list(APPEND TEMP_LIST "offload")
 endif()
 
 set(UR_ADAPTERS_LIST "${TEMP_LIST}" CACHE STRING "" FORCE)
diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp
index 6c4962a69e748..6299bd3280de1 100644
--- a/unified-runtime/source/adapters/offload/adapter.cpp
+++ b/unified-runtime/source/adapters/offload/adapter.cpp
@@ -87,7 +87,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urAdapterGetInfo(ur_adapter_handle_t,
 
   switch (propName) {
   case UR_ADAPTER_INFO_BACKEND:
-    return ReturnValue(UR_ADAPTER_BACKEND_CUDA); // TODO: Return a proper value
+    return ReturnValue(UR_ADAPTER_BACKEND_OFFLOAD);
   case UR_ADAPTER_INFO_REFERENCE_COUNT:
     return ReturnValue(Adapter.RefCount.load());
   default:
diff --git a/unified-runtime/source/adapters/offload/platform.cpp b/unified-runtime/source/adapters/offload/platform.cpp
index 50b9ac90649ce..02b992c96351d 100644
--- a/unified-runtime/source/adapters/offload/platform.cpp
+++ b/unified-runtime/source/adapters/offload/platform.cpp
@@ -49,7 +49,7 @@ urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName,
   case UR_PLATFORM_INFO_PROFILE:
     return ReturnValue("FULL_PROFILE");
   case UR_PLATFORM_INFO_BACKEND:
-    olInfo = OL_PLATFORM_INFO_BACKEND;
+    return ReturnValue(UR_PLATFORM_BACKEND_OFFLOAD);
     break;
   default:
     return UR_RESULT_ERROR_INVALID_ENUMERATION;
@@ -69,24 +69,6 @@ urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName,
                               olInfo, propSize, pPropValue)) {
       return offloadResultToUR(Res);
     }
-
-    // Need to explicitly map this type
-    if (olInfo == OL_PLATFORM_INFO_BACKEND) {
-      auto urPropPtr = reinterpret_cast<ur_platform_backend_t *>(pPropValue);
-      auto olPropPtr = reinterpret_cast<ol_platform_backend_t *>(pPropValue);
-
-      switch (*olPropPtr) {
-      case OL_PLATFORM_BACKEND_CUDA:
-        *urPropPtr = UR_PLATFORM_BACKEND_CUDA;
-        break;
-      case OL_PLATFORM_BACKEND_AMDGPU:
-        *urPropPtr = UR_PLATFORM_BACKEND_HIP;
-        break;
-      default:
-        *urPropPtr = UR_PLATFORM_BACKEND_UNKNOWN;
-        break;
-      }
-    }
   }
 
   return UR_RESULT_SUCCESS;
diff --git a/unified-runtime/source/loader/ur_adapter_registry.hpp b/unified-runtime/source/loader/ur_adapter_registry.hpp
index 53fed55ea0c8a..36e5e9a602756 100644
--- a/unified-runtime/source/loader/ur_adapter_registry.hpp
+++ b/unified-runtime/source/loader/ur_adapter_registry.hpp
@@ -40,6 +40,7 @@ struct FilterTerm {
       {"cuda", UR_ADAPTER_BACKEND_CUDA},
       {"hip", UR_ADAPTER_BACKEND_HIP},
       {"native_cpu", UR_ADAPTER_BACKEND_NATIVE_CPU},
+      {"offload", UR_ADAPTER_BACKEND_OFFLOAD},
   };
 
   bool matchesBackend(const ur_adapter_backend_t &match_backend) const {
diff --git a/unified-runtime/source/loader/ur_lib.cpp b/unified-runtime/source/loader/ur_lib.cpp
index 8163be0fbaebf..c224ca00b1777 100644
--- a/unified-runtime/source/loader/ur_lib.cpp
+++ b/unified-runtime/source/loader/ur_lib.cpp
@@ -251,13 +251,15 @@ ur_result_t urDeviceGetSelected(ur_platform_handle_t hPlatform,
                                 uint32_t NumEntries,
                                 ur_device_handle_t *phDevices,
                                 uint32_t *pNumDevices) {
-  constexpr std::pair<const ur_platform_backend_t, const char *> adapters[6] = {
+  constexpr std::pair<const ur_platform_backend_t, const char *> adapters[7] = {
       {UR_PLATFORM_BACKEND_UNKNOWN, "*"},
       {UR_PLATFORM_BACKEND_LEVEL_ZERO, "level_zero"},
       {UR_PLATFORM_BACKEND_OPENCL, "opencl"},
       {UR_PLATFORM_BACKEND_CUDA, "cuda"},
       {UR_PLATFORM_BACKEND_HIP, "hip"},
-      {UR_PLATFORM_BACKEND_NATIVE_CPU, "native_cpu"}};
+      {UR_PLATFORM_BACKEND_NATIVE_CPU, "native_cpu"},
+      {UR_PLATFORM_BACKEND_OFFLOAD, "offload"},
+    };
 
   if (!hPlatform) {
     return UR_RESULT_ERROR_INVALID_NULL_HANDLE;
diff --git a/unified-runtime/source/loader/ur_manifests.hpp b/unified-runtime/source/loader/ur_manifests.hpp
index 2ed89fc79f4f0..9981d8d1fa5bf 100644
--- a/unified-runtime/source/loader/ur_manifests.hpp
+++ b/unified-runtime/source/loader/ur_manifests.hpp
@@ -79,5 +79,12 @@ const std::vector<ur_adapter_manifest> ur_adapter_manifests = {
      {
          UR_DEVICE_TYPE_CPU,
      }},
+    {"offload",
+     MAKE_LIBRARY_NAME("ur_adapter_offload", "0"),
+     UR_ADAPTER_BACKEND_OFFLOAD,
+     {
+         UR_DEVICE_TYPE_CPU,
+         UR_DEVICE_TYPE_GPU,
+     }},
 };
 } // namespace ur_loader

From e1c2d4fcfe3d90c793f768d53b4935425de5b979 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Thu, 24 Apr 2025 14:25:26 +0100
Subject: [PATCH 04/11] Quick CTS running for Offload

Determining which binary format (spir, ptx, amdhsa) a given offload device
accepts is non-trivial. This should be fixed properly in the future, but for
now let the user specify it via the `UR_OFFLOAD_TARGET_NAME` environment
variable.
---
 unified-runtime/test/conformance/CMakeLists.txt     |  4 ++--
 .../test/conformance/platform/urPlatformGetInfo.cpp |  2 +-
 .../test/conformance/source/environment.cpp         | 13 ++++++++++++-
 3 files changed, 15 insertions(+), 4 deletions(-)

diff --git a/unified-runtime/test/conformance/CMakeLists.txt b/unified-runtime/test/conformance/CMakeLists.txt
index 73f7f10aaabf6..182b80affb423 100644
--- a/unified-runtime/test/conformance/CMakeLists.txt
+++ b/unified-runtime/test/conformance/CMakeLists.txt
@@ -125,10 +125,10 @@ if(UR_FOUND_DPCXX)
     file(MAKE_DIRECTORY ${UR_CONFORMANCE_DEVICE_BINARIES_DIR})
 
     if("${UR_CONFORMANCE_TARGET_TRIPLES}" STREQUAL "")
-        if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_L0_V2 OR UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL)
+        if(UR_BUILD_ADAPTER_L0 OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_L0_V2 OR UR_BUILD_ADAPTER_OPENCL OR UR_BUILD_ADAPTER_ALL)
             list(APPEND TARGET_TRIPLES "spir64")
         endif()
-        if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_ALL)
+        if(UR_BUILD_ADAPTER_CUDA OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_ALL)
             list(APPEND TARGET_TRIPLES "nvptx64-nvidia-cuda")
         endif()
         if(UR_BUILD_ADAPTER_HIP OR UR_BUILD_ADAPTER_OFFLOAD OR UR_BUILD_ADAPTER_ALL)
diff --git a/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp b/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp
index 71a55c93292b2..06b269f54ae2d 100644
--- a/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp
+++ b/unified-runtime/test/conformance/platform/urPlatformGetInfo.cpp
@@ -106,7 +106,7 @@ TEST_P(urPlatformGetInfoTest, SuccessBackend) {
                                    &property_value, nullptr));
 
   ASSERT_TRUE(property_value >= UR_PLATFORM_BACKEND_LEVEL_ZERO &&
-              property_value <= UR_PLATFORM_BACKEND_NATIVE_CPU);
+              property_value <= UR_PLATFORM_BACKEND_OFFLOAD);
 }
 
 TEST_P(urPlatformGetInfoTest, SuccessAdapter) {
diff --git a/unified-runtime/test/conformance/source/environment.cpp b/unified-runtime/test/conformance/source/environment.cpp
index d1fe951e5ce00..d2f63c37929d8 100644
--- a/unified-runtime/test/conformance/source/environment.cpp
+++ b/unified-runtime/test/conformance/source/environment.cpp
@@ -215,6 +215,16 @@ std::string KernelsEnvironment::getTargetName(ur_platform_handle_t platform) {
     return "nvptx64-nvidia-cuda";
   case UR_PLATFORM_BACKEND_HIP:
     return "amdgcn-amd-amdhsa";
+  case UR_PLATFORM_BACKEND_OFFLOAD: {
+    // TODO: In future this should use urDeviceSelectBinary
+    auto result = ur_getenv("UR_OFFLOAD_TARGET_NAME");
+    if (!result) {
+      error = "For offload testing, please specify a target in "
+              "`UR_OFFLOAD_TARGET_NAME`";
+      return {};
+    }
+    return *result;
+  }
   case UR_PLATFORM_BACKEND_NATIVE_CPU:
     error = "native_cpu doesn't support kernel tests yet";
     return {};
@@ -297,7 +307,8 @@ void KernelsEnvironment::CreateProgram(
                                    sizeof(ur_platform_backend_t), &backend,
                                    nullptr));
   if (backend == UR_PLATFORM_BACKEND_HIP ||
-      backend == UR_PLATFORM_BACKEND_CUDA) {
+      backend == UR_PLATFORM_BACKEND_CUDA ||
+      backend == UR_PLATFORM_BACKEND_OFFLOAD) {
     // The CUDA and HIP adapters do not support urProgramCreateWithIL so we
     // need to use urProgramCreateWithBinary instead.
     auto size = binary.size();

From e5de2c4d9c84724d4d2f3ffef0ba8295d8fb85a0 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Tue, 29 Apr 2025 10:24:36 +0100
Subject: [PATCH 05/11] Parse HIP "offload" bundles

SYCL and the UR CTS produce HIP "offload bundles" when compiling for
AMDGPU, which cannot be accepted by the basic AMD offload plugin. This
change adds a simple offload bundle parser which extracts the
appropriate binary from the bundle, allowing it to be fed to liboffload.
---
 .../source/adapters/offload/program.cpp       | 105 +++++++++++++++++-
 1 file changed, 101 insertions(+), 4 deletions(-)

diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp
index 8067a5d17056a..acafef8b73d66 100644
--- a/unified-runtime/source/adapters/offload/program.cpp
+++ b/unified-runtime/source/adapters/offload/program.cpp
@@ -56,6 +56,111 @@ ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t, const uint8_t *,
   return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
 }
 #endif
+
+// Extracts device images from a clang offload bundle. File layout reference:
+// https://clang.llvm.org/docs/ClangOffloadBundler.html#bundled-binary-file-layout
+// The parser only borrows the buffer it is given; it never copies or frees it.
+class HipOffloadBundleParser {
+  static constexpr std::string_view Magic = "__CLANG_OFFLOAD_BUNDLE__";
+  const uint8_t *Buff;
+  size_t Length;
+
+  // Packed mirrors of the bundle records. EntryIdStart is the first byte of
+  // an id string that is EntryIdSize bytes long.
+  struct __attribute__((packed)) BundleEntry {
+    uint64_t ObjectOffset;
+    uint64_t ObjectSize;
+    uint64_t EntryIdSize;
+    char EntryIdStart;
+  };
+
+  struct __attribute__((packed)) BundleHeader {
+    const char HeaderMagic[Magic.size()];
+    uint64_t EntryCount;
+    BundleEntry FirstEntry;
+  };
+
+  // Byte counts of the fixed-width fields in front of the variable-length
+  // parts (the entry list and each entry's id string).
+  static constexpr size_t HeaderFixedSize = Magic.size() + sizeof(uint64_t);
+  static constexpr size_t EntryFixedSize = 3 * sizeof(uint64_t);
+
+  HipOffloadBundleParser() = delete;
+  HipOffloadBundleParser(const uint8_t *Buff, size_t Length)
+      : Buff(Buff), Length(Length) {}
+
+public:
+  // Returns a parser when the buffer starts with the bundle magic, otherwise
+  // std::nullopt (the binary is not a bundle).
+  static std::optional<HipOffloadBundleParser> load(const uint8_t *Buff,
+                                                    size_t Length) {
+    // Compare just the prefix; find() would scan the whole binary on a miss.
+    std::string_view Bytes{reinterpret_cast<const char *>(Buff), Length};
+    if (Bytes.substr(0, Magic.size()) != Magic) {
+      return std::nullopt;
+    }
+    return HipOffloadBundleParser(Buff, Length);
+  }
+
+  // Looks for an entry whose id starts with "hip" and ends with
+  // SearchTargetId. On success OutBinary/OutLength describe that entry's
+  // image inside the bundle buffer. Returns UR_RESULT_ERROR_INVALID_PROGRAM,
+  // leaving the outputs untouched, when the bundle is truncated or malformed
+  // or when no entry matches.
+  ur_result_t extract(std::string_view SearchTargetId,
+                      const uint8_t *&OutBinary, size_t &OutLength) {
+    // The caller's name buffer may include a NUL terminator; match the text
+    // in front of it only.
+    SearchTargetId = SearchTargetId.substr(0, SearchTargetId.find('\0'));
+
+    // The different check here means that a binary consisting of only the magic
+    // bytes (but nothing else) will result in INVALID_PROGRAM rather than being
+    // treated as a non-bundle
+    if (Length < HeaderFixedSize) {
+      return UR_RESULT_ERROR_INVALID_PROGRAM;
+    }
+    auto *Header = reinterpret_cast<const BundleHeader *>(Buff);
+
+    // Bounds are checked on offsets rather than on derived pointers, so sizes
+    // read from the (untrusted) bundle cannot wrap the arithmetic.
+    size_t Offset = HeaderFixedSize;
+    for (uint64_t I = 0; I < Header->EntryCount; I++) {
+      if (Length - Offset < EntryFixedSize) {
+        return UR_RESULT_ERROR_INVALID_PROGRAM;
+      }
+      auto *Entry = reinterpret_cast<const BundleEntry *>(&Buff[Offset]);
+      Offset += EntryFixedSize;
+      if (Entry->EntryIdSize > Length - Offset) {
+        return UR_RESULT_ERROR_INVALID_PROGRAM;
+      }
+      std::string_view EntryId(&Entry->EntryIdStart, Entry->EntryIdSize);
+      Offset += EntryId.size();
+
+      // Will match either "hip" or "hipv4"
+      bool IsHip = EntryId.substr(0, 3) == "hip";
+      // Suffix comparison: find_last_of() would accept any id whose last
+      // character merely occurs somewhere in SearchTargetId.
+      bool TargetMatches =
+          !SearchTargetId.empty() && EntryId.size() >= SearchTargetId.size() &&
+          EntryId.substr(EntryId.size() - SearchTargetId.size()) ==
+              SearchTargetId;
+      if (!IsHip || !TargetMatches) {
+        continue;
+      }
+
+      if (Entry->ObjectOffset > Length ||
+          Entry->ObjectSize > Length - Entry->ObjectOffset) {
+        return UR_RESULT_ERROR_INVALID_PROGRAM;
+      }
+      OutBinary = &Buff[Entry->ObjectOffset];
+      OutLength = Entry->ObjectSize;
+      return UR_RESULT_SUCCESS;
+    }
+
+    return UR_RESULT_ERROR_INVALID_PROGRAM;
+  }
+};
+
 } // namespace
 
 UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
@@ -72,17 +152,34 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
   ur_platform_backend_t PlatformBackend;
   urPlatformGetInfo(DevicePlatform, UR_PLATFORM_INFO_BACKEND,
                     sizeof(ur_platform_backend_t), &PlatformBackend, nullptr);
+
+  auto *RealBinary = ppBinaries[0];
+  size_t RealLength = pLengths[0];
+
+  if (auto Parser = HipOffloadBundleParser::load(RealBinary, RealLength)) {
+    std::string DevName{};
+    size_t DevNameLength;
+    olGetDeviceInfoSize(reinterpret_cast<ol_device_handle_t>(phDevices[0]),
+                        OL_DEVICE_INFO_NAME, &DevNameLength);
+    DevName.resize(DevNameLength);
+    olGetDeviceInfo(reinterpret_cast<ol_device_handle_t>(phDevices[0]),
+                    OL_DEVICE_INFO_NAME, DevNameLength, DevName.data());
+
+    auto Res = Parser->extract(DevName, RealBinary, RealLength);
+    if (Res != UR_RESULT_SUCCESS) {
+      return Res;
+    }
+  }
+
   if (PlatformBackend == UR_PLATFORM_BACKEND_CUDA) {
-    return ProgramCreateCudaWorkaround(hContext, ppBinaries[0], pLengths[0],
+    return ProgramCreateCudaWorkaround(hContext, RealBinary, RealLength,
                                        phProgram);
   }
 
-  auto *RealBinary = const_cast<uint8_t *>(ppBinaries[0]);
-
   ur_program_handle_t Program = new ur_program_handle_t_();
   auto Res =
       olCreateProgram(reinterpret_cast<ol_device_handle_t>(hContext->Device),
-                      RealBinary, pLengths[0], &Program->OffloadProgram);
+                      RealBinary, RealLength, &Program->OffloadProgram);
 
   if (Res != OL_SUCCESS) {
     delete Program;

From d2c52a46213af0a87fcc4720cfce42b9f84f8f9d Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Tue, 29 Apr 2025 12:25:57 +0100
Subject: [PATCH 06/11] Use proper handle types

---
 .../source/adapters/offload/adapter.cpp       | 32 ++++++++----
 .../source/adapters/offload/adapter.hpp       |  6 ++-
 .../source/adapters/offload/common.hpp        |  6 ++-
 .../source/adapters/offload/context.hpp       | 13 ++---
 .../source/adapters/offload/device.cpp        | 49 ++++++-------------
 .../source/adapters/offload/device.hpp        | 14 ++++++
 .../source/adapters/offload/platform.cpp      | 14 +++---
 .../source/adapters/offload/platform.hpp      | 13 +++++
 .../source/adapters/offload/program.cpp       | 11 ++---
 .../source/adapters/offload/queue.cpp         |  6 +--
 .../source/adapters/offload/usm.cpp           | 13 ++---
 11 files changed, 98 insertions(+), 79 deletions(-)
 create mode 100644 unified-runtime/source/adapters/offload/device.hpp
 create mode 100644 unified-runtime/source/adapters/offload/platform.hpp

diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp
index 6299bd3280de1..9371e6684ca9e 100644
--- a/unified-runtime/source/adapters/offload/adapter.cpp
+++ b/unified-runtime/source/adapters/offload/adapter.cpp
@@ -8,12 +8,14 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include <OffloadAPI.h>
 #include <atomic>
 #include <cstdint>
-#include <OffloadAPI.h>
 #include <unordered_set>
 
 #include "adapter.hpp"
+#include "device.hpp"
+#include "platform.hpp"
 #include "ur/ur.hpp"
 #include "ur_api.h"
 
@@ -23,14 +25,15 @@ ur_adapter_handle_t_ Adapter{};
 ur_result_t ur_adapter_handle_t_::init() {
   auto Res = olInit();
 
-  // Discover every platform that isn't the host platform.
-  // Use an unordered_set to deduplicate platforms we discover multiple times
-  // from different devices.
-  // Also discover the host device. We only expect one so don't need to worry
-  // about overwriting it.
+  struct InitUserData {
+    std::unordered_map<ol_platform_handle_t, ur_platform_handle_t> TempMap;
+  } InitUserData{{}};
+
+  // Discover every platform and device
   Res = olIterateDevices(
       [](ol_device_handle_t D, void *UserData) {
-        auto Adapter = static_cast<ur_adapter_handle_t>(UserData);
+        auto *Data = reinterpret_cast<decltype(InitUserData) *>(UserData);
+
         ol_platform_handle_t Platform;
         olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform),
                         &Platform);
@@ -38,13 +41,22 @@ ur_result_t ur_adapter_handle_t_::init() {
         olGetPlatformInfo(Platform, OL_PLATFORM_INFO_BACKEND, sizeof(Backend),
                           &Backend);
         if (Backend == OL_PLATFORM_BACKEND_HOST) {
-          Adapter->HostDevice = D;
+          Adapter.HostDevice = D;
         } else if (Backend != OL_PLATFORM_BACKEND_UNKNOWN) {
-          Adapter->Platforms.insert(Platform);
+          ur_platform_handle_t UrPlatform;
+          if (!Data->TempMap.count(Platform)) {
+            Adapter.Platforms.push_back(ur_platform_handle_t_{Platform});
+            UrPlatform = &Adapter.Platforms.back();
+            Data->TempMap.insert({Platform, UrPlatform});
+          } else {
+            UrPlatform = Data->TempMap[Platform];
+          }
+
+          UrPlatform->Devices.push_back(ur_device_handle_t_{UrPlatform, D});
         }
         return false;
       },
-      this);
+      &InitUserData);
 
   (void)Res;
 
diff --git a/unified-runtime/source/adapters/offload/adapter.hpp b/unified-runtime/source/adapters/offload/adapter.hpp
index be36aceaa8410..a9fd927b55785 100644
--- a/unified-runtime/source/adapters/offload/adapter.hpp
+++ b/unified-runtime/source/adapters/offload/adapter.hpp
@@ -6,13 +6,21 @@
 
 #include <OffloadAPI.h>
+#include <deque>
 
+#include "common.hpp"
 #include "logger/ur_logger.hpp"
+#include "platform.hpp"
 
-struct ur_adapter_handle_t_ {
+// State of the Offload adapter: reference count, logger, the liboffload host
+// device and every platform found by init().
+struct ur_adapter_handle_t_ : ur::offload::handle_base {
   std::atomic_uint32_t RefCount = 0;
   logger::Logger &Logger = logger::get_logger("offload");
   ol_device_handle_t HostDevice = nullptr;
-  std::unordered_set<ol_platform_handle_t> Platforms;
+  // Addresses of these elements are handed out as ur_platform_handle_t and
+  // stored in each device, so they must stay valid while platforms are
+  // appended: std::deque::push_back keeps existing elements in place.
+  std::deque<ur_platform_handle_t_> Platforms;
 
   ur_result_t init();
 };
diff --git a/unified-runtime/source/adapters/offload/common.hpp b/unified-runtime/source/adapters/offload/common.hpp
index 69aa6bff11e9f..152714bdc2cc5 100644
--- a/unified-runtime/source/adapters/offload/common.hpp
+++ b/unified-runtime/source/adapters/offload/common.hpp
@@ -2,6 +2,10 @@
 
 #include <atomic>
 
-struct RefCounted {
+namespace ur::offload {
+struct handle_base {};
+} // namespace ur::offload
+
+struct RefCounted : ur::offload::handle_base {
   std::atomic_uint32_t RefCount = 1;
 };
diff --git a/unified-runtime/source/adapters/offload/context.hpp b/unified-runtime/source/adapters/offload/context.hpp
index 9483ec1b4a8b8..ce43d428cd0c7 100644
--- a/unified-runtime/source/adapters/offload/context.hpp
+++ b/unified-runtime/source/adapters/offload/context.hpp
@@ -1,19 +1,16 @@
 #pragma once
 
-#include <atomic>
+#include "common.hpp"
+#include <OffloadAPI.h>
 #include <unordered_map>
 #include <ur_api.h>
-#include <OffloadAPI.h>
 
-struct ur_context_handle_t_ {
+struct ur_context_handle_t_ : RefCounted {
   ur_context_handle_t_(ur_device_handle_t hDevice) : Device{hDevice} {
     urDeviceRetain(Device);
   }
-  ~ur_context_handle_t_() {
-    urDeviceRelease(Device);
-  }
+  ~ur_context_handle_t_() { urDeviceRelease(Device); }
 
   ur_device_handle_t Device;
-  std::atomic_uint32_t RefCount;
-  std::unordered_map<void*, ol_alloc_type_t> AllocTypeMap;
+  std::unordered_map<void *, ol_alloc_type_t> AllocTypeMap;
 };
diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp
index 54cc60c6fe110..d67f3555cf640 100644
--- a/unified-runtime/source/adapters/offload/device.cpp
+++ b/unified-runtime/source/adapters/offload/device.cpp
@@ -2,6 +2,8 @@
 #include <ur/ur.hpp>
 #include <ur_api.h>
 
+#include "device.hpp"
+#include "platform.hpp"
 #include "ur2offload.hpp"
 
 UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform,
@@ -9,38 +11,17 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGet(ur_platform_handle_t hPlatform,
                                                 uint32_t NumEntries,
                                                 ur_device_handle_t *phDevices,
                                                 uint32_t *pNumDevices) {
+  if (pNumDevices) {
+    *pNumDevices = static_cast<uint32_t>(hPlatform->Devices.size());
+  }
 
-  uint32_t NumDevices = 0;
-  // Pass a few things to the callback (we can't use a lambda with captures)
-  using ParamsT = struct {
-    uint32_t DeviceLimit;
-    uint32_t &NumDevices;
-    ol_platform_handle_t Platform;
-    ol_device_handle_t *DevicesOut;
-  };
-  ParamsT Params = {NumEntries, NumDevices,
-                    reinterpret_cast<ol_platform_handle_t>(hPlatform),
-                    reinterpret_cast<ol_device_handle_t *>(phDevices)};
-
-  olIterateDevices(
-      [](ol_device_handle_t D, void *Data) {
-        auto Params = reinterpret_cast<ParamsT *>(Data);
-        ol_platform_handle_t Platform = nullptr;
-        olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform),
-                        &Platform);
-        if (Platform == Params->Platform) {
-          if (Params->DevicesOut) {
-            Params->DevicesOut[Params->NumDevices] = D;
-          }
-          Params->NumDevices++;
-        }
-        return Params->NumDevices == Params->DeviceLimit;
-      },
-      &Params);
+  size_t NumDevices =
+      std::min(static_cast<uint32_t>(hPlatform->Devices.size()), NumEntries);
 
-  if (pNumDevices) {
-    *pNumDevices = NumDevices;
+  for (size_t I = 0; I < NumDevices; I++) {
+    phDevices[I] = &hPlatform->Devices[I];
   }
+
   return UR_RESULT_SUCCESS;
 }
 
@@ -74,7 +55,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
     olInfo = OL_DEVICE_INFO_DRIVER_VERSION;
     break;
   case UR_DEVICE_INFO_PLATFORM:
-    olInfo = OL_DEVICE_INFO_PLATFORM;
+    return ReturnValue(hDevice->Platform);
     break;
   case UR_DEVICE_INFO_USM_SINGLE_SHARED_SUPPORT:
     return ReturnValue(UR_DEVICE_USM_ACCESS_CAPABILITY_FLAG_ACCESS);
@@ -86,16 +67,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urDeviceGetInfo(ur_device_handle_t hDevice,
 
   if (pPropSizeRet) {
     if (auto Res =
-            olGetDeviceInfoSize(reinterpret_cast<ol_device_handle_t>(hDevice),
-                                olInfo, pPropSizeRet)) {
+            olGetDeviceInfoSize(hDevice->OffloadDevice, olInfo, pPropSizeRet)) {
       return offloadResultToUR(Res);
     }
   }
 
   if (pPropValue) {
-    if (auto Res =
-            olGetDeviceInfo(reinterpret_cast<ol_device_handle_t>(hDevice),
-                            olInfo, propSize, pPropValue)) {
+    if (auto Res = olGetDeviceInfo(hDevice->OffloadDevice, olInfo, propSize,
+                                   pPropValue)) {
       return offloadResultToUR(Res);
     }
     // Need to explicitly map this type
diff --git a/unified-runtime/source/adapters/offload/device.hpp b/unified-runtime/source/adapters/offload/device.hpp
new file mode 100644
index 0000000000000..b1fc24792f3b4
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/device.hpp
@@ -0,0 +1,14 @@
+#pragma once
+
+#include "common.hpp"
+#include <OffloadAPI.h>
+#include <ur_api.h>
+
+struct ur_device_handle_t_ : ur::offload::handle_base { // UR device handle
+  ur_device_handle_t_(ur_platform_handle_t Platform,
+                      ol_device_handle_t OffloadDevice)
+      : handle_base(), Platform(Platform), OffloadDevice(OffloadDevice) {}
+
+  ur_platform_handle_t Platform;    // UR platform this device is listed under
+  ol_device_handle_t OffloadDevice; // wrapped liboffload device handle
+};
diff --git a/unified-runtime/source/adapters/offload/platform.cpp b/unified-runtime/source/adapters/offload/platform.cpp
index 02b992c96351d..133a653baa7ca 100644
--- a/unified-runtime/source/adapters/offload/platform.cpp
+++ b/unified-runtime/source/adapters/offload/platform.cpp
@@ -4,6 +4,7 @@
 #include <ur_api.h>
 
 #include "adapter.hpp"
+#include "device.hpp"
 #include "ur2offload.hpp"
 
 UR_APIEXPORT ur_result_t UR_APICALL
@@ -17,8 +18,7 @@ urPlatformGet(ur_adapter_handle_t, uint32_t NumEntries,
   if (phPlatforms) {
     size_t PlatformIndex = 0;
     for (auto &Platform : Adapter.Platforms) {
-      phPlatforms[PlatformIndex++] =
-          reinterpret_cast<ur_platform_handle_t>(Platform);
+      phPlatforms[PlatformIndex++] = &Platform;
       if (PlatformIndex == NumEntries) {
         break;
       }
@@ -56,17 +56,15 @@ urPlatformGetInfo(ur_platform_handle_t hPlatform, ur_platform_info_t propName,
   }
 
   if (pPropSizeRet) {
-    if (auto Res = olGetPlatformInfoSize(
-            reinterpret_cast<ol_platform_handle_t>(hPlatform), olInfo,
-            pPropSizeRet)) {
+    if (auto Res = olGetPlatformInfoSize(hPlatform->OffloadPlatform, olInfo,
+                                         pPropSizeRet)) {
       return offloadResultToUR(Res);
     }
   }
 
   if (pPropValue) {
-    if (auto Res =
-            olGetPlatformInfo(reinterpret_cast<ol_platform_handle_t>(hPlatform),
-                              olInfo, propSize, pPropValue)) {
+    if (auto Res = olGetPlatformInfo(hPlatform->OffloadPlatform, olInfo,
+                                     propSize, pPropValue)) {
       return offloadResultToUR(Res);
     }
   }
diff --git a/unified-runtime/source/adapters/offload/platform.hpp b/unified-runtime/source/adapters/offload/platform.hpp
new file mode 100644
index 0000000000000..100e103998364
--- /dev/null
+++ b/unified-runtime/source/adapters/offload/platform.hpp
@@ -0,0 +1,22 @@
+#pragma once
+
+#include "common.hpp"
+#include "device.hpp"
+#include <OffloadAPI.h>
+#include <ur_api.h>
+#include <vector>
+
+// UR platform handle: wraps one liboffload platform together with the devices
+// that were discovered on it.
+struct ur_platform_handle_t_ : ur::offload::handle_base {
+  // Wraps OffloadPlatform; the device list starts out empty.
+  ur_platform_handle_t_(ol_platform_handle_t OffloadPlatform)
+      : handle_base(), OffloadPlatform(OffloadPlatform) {}
+
+  // Underlying liboffload platform handle.
+  ol_platform_handle_t OffloadPlatform;
+  // Devices belonging to this platform, stored by value (hence the device.hpp
+  // include: std::vector needs the complete element type). urDeviceGet hands
+  // out pointers to these elements.
+  std::vector<ur_device_handle_t_> Devices;
+};
diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp
index acafef8b73d66..5425cbed42095 100644
--- a/unified-runtime/source/adapters/offload/program.cpp
+++ b/unified-runtime/source/adapters/offload/program.cpp
@@ -3,6 +3,7 @@
 #include <ur_api.h>
 
 #include "context.hpp"
+#include "device.hpp"
 #include "program.hpp"
 #include "ur2offload.hpp"
 
@@ -33,9 +34,8 @@ ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t hContext,
   fprintf(stderr, "Performed CUDA bin workaround (size = %lu)\n", RealLength);
 
   ur_program_handle_t Program = new ur_program_handle_t_();
-  auto Res =
-      olCreateProgram(reinterpret_cast<ol_device_handle_t>(hContext->Device),
-                      RealBinary, RealLength, &Program->OffloadProgram);
+  auto Res = olCreateProgram(hContext->Device->OffloadDevice, RealBinary,
+                             RealLength, &Program->OffloadProgram);
 
   // Program owns the linked module now
   cuLinkDestroy(State);
@@ -177,9 +177,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
   }
 
   ur_program_handle_t Program = new ur_program_handle_t_();
-  auto Res =
-      olCreateProgram(reinterpret_cast<ol_device_handle_t>(hContext->Device),
-                      RealBinary, RealLength, &Program->OffloadProgram);
+  auto Res = olCreateProgram(hContext->Device->OffloadDevice, RealBinary,
+                             RealLength, &Program->OffloadProgram);
 
   if (Res != OL_SUCCESS) {
     delete Program;
diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp
index 2900deab68c60..32adb1f512c11 100644
--- a/unified-runtime/source/adapters/offload/queue.cpp
+++ b/unified-runtime/source/adapters/offload/queue.cpp
@@ -3,6 +3,7 @@
 #include <ur_api.h>
 
 #include "context.hpp"
+#include "device.hpp"
 #include "queue.hpp"
 #include "ur2offload.hpp"
 
@@ -13,14 +14,13 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueCreate(
   assert(hContext->Device == hDevice);
 
   ur_queue_handle_t Queue = new ur_queue_handle_t_();
-  auto Res = olCreateQueue(reinterpret_cast<ol_device_handle_t>(hDevice),
-                           &Queue->OffloadQueue);
+  auto Res = olCreateQueue(hDevice->OffloadDevice, &Queue->OffloadQueue);
   if (Res != OL_SUCCESS) {
     delete Queue;
     return offloadResultToUR(Res);
   }
 
-  Queue->OffloadDevice = reinterpret_cast<ol_device_handle_t>(hDevice);
+  Queue->OffloadDevice = hDevice->OffloadDevice;
 
   *phQueue = Queue;
 
diff --git a/unified-runtime/source/adapters/offload/usm.cpp b/unified-runtime/source/adapters/offload/usm.cpp
index a597cf87c21b1..f603516811f0a 100644
--- a/unified-runtime/source/adapters/offload/usm.cpp
+++ b/unified-runtime/source/adapters/offload/usm.cpp
@@ -3,14 +3,15 @@
 #include <ur_api.h>
 
 #include "context.hpp"
+#include "device.hpp"
 #include "ur2offload.hpp"
 
 UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext,
                                                    const ur_usm_desc_t *,
                                                    ur_usm_pool_handle_t,
                                                    size_t size, void **ppMem) {
-  auto Res = olMemAlloc(reinterpret_cast<ol_device_handle_t>(hContext->Device),
-                        OL_ALLOC_TYPE_HOST, size, ppMem);
+  auto Res = olMemAlloc(hContext->Device->OffloadDevice, OL_ALLOC_TYPE_HOST,
+                        size, ppMem);
 
   if (Res != OL_SUCCESS) {
     return offloadResultToUR(Res);
@@ -23,8 +24,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMHostAlloc(ur_context_handle_t hContext,
 UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc(
     ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *,
     ur_usm_pool_handle_t, size_t size, void **ppMem) {
-  auto Res = olMemAlloc(reinterpret_cast<ol_device_handle_t>(hContext->Device),
-                        OL_ALLOC_TYPE_DEVICE, size, ppMem);
+  auto Res = olMemAlloc(hContext->Device->OffloadDevice, OL_ALLOC_TYPE_DEVICE,
+                        size, ppMem);
 
   if (Res != OL_SUCCESS) {
     return offloadResultToUR(Res);
@@ -37,8 +38,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urUSMDeviceAlloc(
 UR_APIEXPORT ur_result_t UR_APICALL urUSMSharedAlloc(
     ur_context_handle_t hContext, ur_device_handle_t, const ur_usm_desc_t *,
     ur_usm_pool_handle_t, size_t size, void **ppMem) {
-  auto Res = olMemAlloc(reinterpret_cast<ol_device_handle_t>(hContext->Device),
-                        OL_ALLOC_TYPE_MANAGED, size, ppMem);
+  auto Res = olMemAlloc(hContext->Device->OffloadDevice, OL_ALLOC_TYPE_MANAGED,
+                        size, ppMem);
 
   if (Res != OL_SUCCESS) {
     return offloadResultToUR(Res);

From 4e28f8275f85bf2e1e22849358ff1733fa3aa506 Mon Sep 17 00:00:00 2001
From: Ross Brunton <ross@codeplay.com>
Date: Wed, 30 Apr 2025 14:47:14 +0100
Subject: [PATCH 07/11] Fix broken cast

---
 unified-runtime/source/adapters/offload/program.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp
index 5425cbed42095..6384d7dd3ced2 100644
--- a/unified-runtime/source/adapters/offload/program.cpp
+++ b/unified-runtime/source/adapters/offload/program.cpp
@@ -159,11 +159,18 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
   if (auto Parser = HipOffloadBundleParser::load(RealBinary, RealLength)) {
     std::string DevName{};
-    size_t DevNameLength;
-    olGetDeviceInfoSize(reinterpret_cast<ol_device_handle_t>(phDevices[0]),
-                        OL_DEVICE_INFO_NAME, &DevNameLength);
+    // Bail out on query failure so DevNameLength is never read uninitialized.
+    size_t DevNameLength = 0;
+    if (auto SizeRes = olGetDeviceInfoSize(phDevices[0]->OffloadDevice,
+                                           OL_DEVICE_INFO_NAME,
+                                           &DevNameLength)) {
+      return offloadResultToUR(SizeRes);
+    }
     DevName.resize(DevNameLength);
-    olGetDeviceInfo(reinterpret_cast<ol_device_handle_t>(phDevices[0]),
-                    OL_DEVICE_INFO_NAME, DevNameLength, DevName.data());
+    if (auto NameRes = olGetDeviceInfo(phDevices[0]->OffloadDevice,
+                                       OL_DEVICE_INFO_NAME, DevNameLength,
+                                       DevName.data())) {
+      return offloadResultToUR(NameRes);
+    }
 
     auto Res = Parser->extract(DevName, RealBinary, RealLength);
     if (Res != UR_RESULT_SUCCESS) {

From a257d916e7e1a7df740bc8cfd29b06fb3d1ef7a4 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum@codeplay.com>
Date: Tue, 29 Apr 2025 17:30:15 +0100
Subject: [PATCH 08/11] Fix Offload build on CUDA and detect correct targets in
 the CTS

---
 .../source/adapters/offload/CMakeLists.txt    | 20 ++++++++++--------
 .../source/adapters/offload/device.cpp        | 19 ++++++++++-------
 .../source/adapters/offload/program.cpp       | 17 +++++++--------
 .../test/conformance/source/environment.cpp   | 21 +++++++++++++------
 4 files changed, 46 insertions(+), 31 deletions(-)

diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt
index 6461fabe647b2..bbef36f3ff8d2 100644
--- a/unified-runtime/source/adapters/offload/CMakeLists.txt
+++ b/unified-runtime/source/adapters/offload/CMakeLists.txt
@@ -18,8 +18,8 @@ if (NOT TARGET cudadrv)
         add_library(cudadrv SHARED IMPORTED GLOBAL)
         set_target_properties(
                 cudadrv PROPERTIES 
-                IMPORTED_LOCATION             ${CUDA_CUDA_LIBRARY}
-                INTERFACE_INCLUDE_DIRECTORIES ${CUDA_INCLUDE_DIRS}
+                IMPORTED_LOCATION             ${CUDA_cuda_driver_LIBRARY}
+                INTERFACE_INCLUDE_DIRECTORIES ${CUDAToolkit_INCLUDE_DIRS}
         )
 endif()
 
@@ -44,20 +44,22 @@ set_target_properties(${TARGET_NAME} PROPERTIES
         SOVERSION "${PROJECT_VERSION_MAJOR}"
 )
 
+set(ADDITIONAL_LINK_LIBS "")
+if (CUDA_cuda_driver_LIBRARY)
+    list(APPEND ADDITIONAL_LINK_LIBS
+        cudadrv
+    )
+    target_compile_definitions(${TARGET_NAME} PRIVATE UR_CUDA_ENABLED)
+endif()
+
 target_link_libraries(${TARGET_NAME} PRIVATE
         ${PROJECT_NAME}::headers
         ${PROJECT_NAME}::common
         ${PROJECT_NAME}::umf
         ${UR_OFFLOAD_INSTALL_DIR}/lib/libLLVMOffload.so
+        ${ADDITIONAL_LINK_LIBS}
 )
 
-if (CUDA_CUDA_LIBRARY)
-    target_link_libraries(${TARGET_NAME}
-        cudadrv
-    )
-    target_compile_definitions(${TARGET_NAME} PRIVATE UR_CUDA_ENABLED=1)
-endif()
-
 target_include_directories(${TARGET_NAME} PRIVATE
         "${UR_OFFLOAD_INCLUDE_DIR}/offload"
         "${CMAKE_CURRENT_SOURCE_DIR}/../../"
diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp
index d67f3555cf640..ec5929577b8dc 100644
--- a/unified-runtime/source/adapters/offload/device.cpp
+++ b/unified-runtime/source/adapters/offload/device.cpp
@@ -119,15 +119,26 @@ urDevicePartition(ur_device_handle_t, const ur_device_partition_properties_t *,
 UR_APIEXPORT ur_result_t UR_APICALL urDeviceSelectBinary(
     ur_device_handle_t hDevice, const ur_device_binary_t *pBinaries,
     uint32_t NumBinaries, uint32_t *pSelectedBinary) {
-  std::ignore = hDevice;
-  std::ignore = pBinaries;
-  std::ignore = NumBinaries;
-  std::ignore = pSelectedBinary;
 
-  // TODO: Don't hard code nvptx64!
-  const char *image_target = UR_DEVICE_BINARY_TARGET_NVPTX64;
+  // Select the target from the backend reported for the device's platform;
+  // a failed query is returned rather than reading Backend uninitialized.
+  ol_platform_backend_t Backend;
+  if (auto Res = olGetPlatformInfo(hDevice->Platform->OffloadPlatform,
+                                   OL_PLATFORM_INFO_BACKEND, sizeof(Backend),
+                                   &Backend)) {
+    return offloadResultToUR(Res);
+  }
+
+  // Backends other than CUDA and AMDGPU keep the "unknown" target.
+  const char *ImageTarget = UR_DEVICE_BINARY_TARGET_UNKNOWN;
+  if (Backend == OL_PLATFORM_BACKEND_CUDA) {
+    ImageTarget = UR_DEVICE_BINARY_TARGET_NVPTX64;
+  } else if (Backend == OL_PLATFORM_BACKEND_AMDGPU) {
+    ImageTarget = UR_DEVICE_BINARY_TARGET_AMDGCN;
+  }
+
   for (uint32_t i = 0; i < NumBinaries; ++i) {
-    if (strcmp(pBinaries[i].pDeviceTargetSpec, image_target) == 0) {
+    if (strcmp(pBinaries[i].pDeviceTargetSpec, ImageTarget) == 0) {
       *pSelectedBinary = i;
       return UR_RESULT_SUCCESS;
     }
diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp
index 6384d7dd3ced2..8489b3c5b0a7e 100644
--- a/unified-runtime/source/adapters/offload/program.cpp
+++ b/unified-runtime/source/adapters/offload/program.cpp
@@ -4,6 +4,7 @@
 
 #include "context.hpp"
 #include "device.hpp"
+#include "platform.hpp"
 #include "program.hpp"
 #include "ur2offload.hpp"
 
@@ -31,7 +32,10 @@ ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t hContext,
   cuLinkComplete(State, &CuBin, &CuBinSize);
   RealBinary = (uint8_t *)CuBin;
   RealLength = CuBinSize;
+
+#if 0
   fprintf(stderr, "Performed CUDA bin workaround (size = %lu)\n", RealLength);
+#endif
 
   ur_program_handle_t Program = new ur_program_handle_t_();
   auto Res = olCreateProgram(hContext->Device->OffloadDevice, RealBinary,
@@ -39,7 +43,6 @@ ur_result_t ProgramCreateCudaWorkaround(ur_context_handle_t hContext,
 
   // Program owns the linked module now
   cuLinkDestroy(State);
-  (void)State;
 
   if (Res != OL_SUCCESS) {
     delete Program;
@@ -146,13 +149,6 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
     return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
   }
 
-  ur_platform_handle_t DevicePlatform;
-  urDeviceGetInfo(phDevices[0], UR_DEVICE_INFO_PLATFORM,
-                  sizeof(ur_platform_handle_t), &DevicePlatform, nullptr);
-  ur_platform_backend_t PlatformBackend;
-  urPlatformGetInfo(DevicePlatform, UR_PLATFORM_INFO_BACKEND,
-                    sizeof(ur_platform_backend_t), &PlatformBackend, nullptr);
-
   auto *RealBinary = ppBinaries[0];
   size_t RealLength = pLengths[0];
 
@@ -171,7 +167,14 @@ UR_APIEXPORT ur_result_t UR_APICALL urProgramCreateWithBinary(
     }
   }
 
-  if (PlatformBackend == UR_PLATFORM_BACKEND_CUDA) {
+  // Bail out on query failure so Backend is never compared uninitialized.
+  ol_platform_backend_t Backend;
+  if (auto Res = olGetPlatformInfo(phDevices[0]->Platform->OffloadPlatform,
+                                   OL_PLATFORM_INFO_BACKEND, sizeof(Backend),
+                                   &Backend)) {
+    return offloadResultToUR(Res);
+  }
+  if (Backend == OL_PLATFORM_BACKEND_CUDA) {
     return ProgramCreateCudaWorkaround(hContext, RealBinary, RealLength,
                                        phProgram);
   }
diff --git a/unified-runtime/test/conformance/source/environment.cpp b/unified-runtime/test/conformance/source/environment.cpp
index d2f63c37929d8..8298f662dcc51 100644
--- a/unified-runtime/test/conformance/source/environment.cpp
+++ b/unified-runtime/test/conformance/source/environment.cpp
@@ -216,14 +216,29 @@ std::string KernelsEnvironment::getTargetName(ur_platform_handle_t platform) {
   case UR_PLATFORM_BACKEND_HIP:
     return "amdgcn-amd-amdhsa";
   case UR_PLATFORM_BACKEND_OFFLOAD: {
-    // TODO: In future this should use urDeviceSelectBinary
-    auto result = ur_getenv("UR_OFFLOAD_TARGET_NAME");
-    if (!result) {
-      error = "For offload testing, please specify a target in "
-              "`UR_OFFLOAD_TARGET_NAME`";
+    // All Offload platforms report this backend, so use the platform name to
+    // select the actual underlying backend.
+    size_t PlatformNameSize = 0;
+    if (urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, 0, nullptr,
+                          &PlatformNameSize) != UR_RESULT_SUCCESS ||
+        PlatformNameSize == 0) {
+      error = "Could not query the Offload platform name";
+      return {};
+    }
+    std::vector<char> PlatformName(PlatformNameSize);
+    if (urPlatformGetInfo(platform, UR_PLATFORM_INFO_NAME, PlatformNameSize,
+                          PlatformName.data(), nullptr) != UR_RESULT_SUCCESS) {
+      error = "Could not query the Offload platform name";
+      return {};
+    }
+    if (std::strcmp(PlatformName.data(), "CUDA") == 0) {
+      return "nvptx64-nvidia-cuda";
+    } else if (std::strcmp(PlatformName.data(), "AMDGPU") == 0) {
+      return "amdgcn-amd-amdhsa";
+    } else {
+      error = "Could not detect target for Offload platform";
       return {};
     }
-    return *result;
   }
   case UR_PLATFORM_BACKEND_NATIVE_CPU:
     error = "native_cpu doesn't support kernel tests yet";

From 3794a92cacaa5d4f0192485acd0f0d1e7daa763a Mon Sep 17 00:00:00 2001
From: Callum Fare <callum@codeplay.com>
Date: Wed, 30 Apr 2025 17:36:25 +0100
Subject: [PATCH 09/11] Add missing license text

---
 .../source/adapters/offload/CMakeLists.txt      |  5 +++++
 .../source/adapters/offload/adapter.cpp         |  2 +-
 .../source/adapters/offload/adapter.hpp         | 10 ++++++++++
 .../source/adapters/offload/common.hpp          | 10 ++++++++++
 .../source/adapters/offload/context.cpp         | 10 ++++++++++
 .../source/adapters/offload/context.hpp         | 10 ++++++++++
 .../source/adapters/offload/device.cpp          | 10 ++++++++++
 .../source/adapters/offload/device.hpp          | 10 ++++++++++
 .../source/adapters/offload/enqueue.cpp         | 14 ++++++++++++--
 .../source/adapters/offload/event.cpp           | 10 ++++++++++
 .../source/adapters/offload/event.hpp           | 14 ++++++++++++--
 .../source/adapters/offload/kernel.cpp          | 17 +++++++++++++----
 .../source/adapters/offload/kernel.hpp          | 10 ++++++++++
 .../source/adapters/offload/platform.cpp        | 10 ++++++++++
 .../source/adapters/offload/platform.hpp        | 15 +++++++++++++--
 .../source/adapters/offload/program.cpp         | 10 ++++++++++
 .../source/adapters/offload/program.hpp         | 14 ++++++++++++--
 .../source/adapters/offload/queue.cpp           | 10 ++++++++++
 .../source/adapters/offload/queue.hpp           | 16 +++++++++++++---
 unified-runtime/source/adapters/offload/usm.cpp | 10 ++++++++++
 unified-runtime/source/loader/ur_lib.cpp        |  2 +-
 21 files changed, 202 insertions(+), 17 deletions(-)

diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt
index bbef36f3ff8d2..dfcafc0fa98e5 100644
--- a/unified-runtime/source/adapters/offload/CMakeLists.txt
+++ b/unified-runtime/source/adapters/offload/CMakeLists.txt
@@ -1,3 +1,8 @@
+# Copyright (C) 2025 Intel Corporation
+# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions.
+# See LICENSE.TXT
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
 set(UR_OFFLOAD_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH "Offload adapter directory") # TODO
 
 set(TARGET_NAME ur_adapter_offload)
diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp
index 9371e6684ca9e..fd3608dcacb0c 100644
--- a/unified-runtime/source/adapters/offload/adapter.cpp
+++ b/unified-runtime/source/adapters/offload/adapter.cpp
@@ -1,4 +1,4 @@
-//===----------- adapter.cpp - LLVM Offload Plugin  -----------------------===//
+//===----------- adapter.cpp - LLVM Offload Adapter  ----------------------===//
 //
 // Copyright (C) 2024 Intel Corporation
 //
diff --git a/unified-runtime/source/adapters/offload/adapter.hpp b/unified-runtime/source/adapters/offload/adapter.hpp
index a9fd927b55785..b85995b0f6a08 100644
--- a/unified-runtime/source/adapters/offload/adapter.hpp
+++ b/unified-runtime/source/adapters/offload/adapter.hpp
@@ -1,3 +1,13 @@
+//===----------- adapter.hpp - LLVM Offload Adapter  ----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #pragma once
 
 #include <atomic>
diff --git a/unified-runtime/source/adapters/offload/common.hpp b/unified-runtime/source/adapters/offload/common.hpp
index 152714bdc2cc5..2159f9ae993a1 100644
--- a/unified-runtime/source/adapters/offload/common.hpp
+++ b/unified-runtime/source/adapters/offload/common.hpp
@@ -1,3 +1,13 @@
+//===----------- common.hpp - LLVM Offload Adapter  -----------------------===//
+//
+// Copyright (C) 2024 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #pragma once
 
 #include <atomic>
diff --git a/unified-runtime/source/adapters/offload/context.cpp b/unified-runtime/source/adapters/offload/context.cpp
index 01d015038c3b1..5e76ab5abb256 100644
--- a/unified-runtime/source/adapters/offload/context.cpp
+++ b/unified-runtime/source/adapters/offload/context.cpp
@@ -1,3 +1,13 @@
+//===----------- context.cpp - LLVM Offload Adapter  ----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "context.hpp"
 #include <ur_api.h>
 
diff --git a/unified-runtime/source/adapters/offload/context.hpp b/unified-runtime/source/adapters/offload/context.hpp
index ce43d428cd0c7..64727ce3338bb 100644
--- a/unified-runtime/source/adapters/offload/context.hpp
+++ b/unified-runtime/source/adapters/offload/context.hpp
@@ -1,3 +1,13 @@
+//===----------- context.hpp - LLVM Offload Adapter  ----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #pragma once
 
 #include "common.hpp"
diff --git a/unified-runtime/source/adapters/offload/device.cpp b/unified-runtime/source/adapters/offload/device.cpp
index ec5929577b8dc..2dfa7d05ed3fe 100644
--- a/unified-runtime/source/adapters/offload/device.cpp
+++ b/unified-runtime/source/adapters/offload/device.cpp
@@ -1,3 +1,13 @@
+//===----------- device.cpp - LLVM Offload Adapter  -----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <OffloadAPI.h>
 #include <ur/ur.hpp>
 #include <ur_api.h>
diff --git a/unified-runtime/source/adapters/offload/device.hpp b/unified-runtime/source/adapters/offload/device.hpp
index b1fc24792f3b4..1f616745384e8 100644
--- a/unified-runtime/source/adapters/offload/device.hpp
+++ b/unified-runtime/source/adapters/offload/device.hpp
@@ -1,3 +1,13 @@
+//===----------- device.hpp - LLVM Offload Adapter  -----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #pragma once
 
 #include "common.hpp"
diff --git a/unified-runtime/source/adapters/offload/enqueue.cpp b/unified-runtime/source/adapters/offload/enqueue.cpp
index 30f5a099429fa..7ec26a3b25ea0 100644
--- a/unified-runtime/source/adapters/offload/enqueue.cpp
+++ b/unified-runtime/source/adapters/offload/enqueue.cpp
@@ -1,3 +1,13 @@
+//===----------- enqueue.cpp - LLVM Offload Adapter  ----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <OffloadAPI.h>
 #include <assert.h>
 #include <ur_api.h>
@@ -17,8 +27,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
   (void)phEventWaitList;
   //
 
-  (void) pGlobalWorkOffset;
-  (void) pLocalWorkSize;
+  (void)pGlobalWorkOffset;
+  (void)pLocalWorkSize;
 
   assert(workDim == 1);
 
diff --git a/unified-runtime/source/adapters/offload/event.cpp b/unified-runtime/source/adapters/offload/event.cpp
index 5dec5fa29d113..cd92464110eeb 100644
--- a/unified-runtime/source/adapters/offload/event.cpp
+++ b/unified-runtime/source/adapters/offload/event.cpp
@@ -1,3 +1,13 @@
+//===----------- event.cpp - LLVM Offload Adapter  ------------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <OffloadAPI.h>
 #include <ur_api.h>
 
diff --git a/unified-runtime/source/adapters/offload/event.hpp b/unified-runtime/source/adapters/offload/event.hpp
index 95f692214e6f1..16e0dc649d2ef 100644
--- a/unified-runtime/source/adapters/offload/event.hpp
+++ b/unified-runtime/source/adapters/offload/event.hpp
@@ -1,10 +1,20 @@
+//===----------- event.hpp - LLVM Offload Adapter  ------------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #pragma once
 
-#include <ur_api.h>
 #include <OffloadAPI.h>
+#include <ur_api.h>
 
 #include "common.hpp"
 
 struct ur_event_handle_t_ : RefCounted {
- ol_event_handle_t OffloadEvent;
+  ol_event_handle_t OffloadEvent;
 };
diff --git a/unified-runtime/source/adapters/offload/kernel.cpp b/unified-runtime/source/adapters/offload/kernel.cpp
index 6ab95aa6640da..9195bec1f72fc 100644
--- a/unified-runtime/source/adapters/offload/kernel.cpp
+++ b/unified-runtime/source/adapters/offload/kernel.cpp
@@ -1,3 +1,13 @@
+//===----------- kernel.cpp - LLVM Offload Adapter  -----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include "kernel.hpp"
 #include "program.hpp"
 #include "ur2offload.hpp"
@@ -56,10 +66,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelSetArgValue(
   return UR_RESULT_SUCCESS;
 }
 
-UR_APIEXPORT ur_result_t UR_APICALL
-urKernelGetGroupInfo(ur_kernel_handle_t, ur_device_handle_t,
-                     ur_kernel_group_info_t propName, size_t propSize,
-                     void *pPropValue, size_t *pPropSizeRet) {
+UR_APIEXPORT ur_result_t UR_APICALL urKernelGetGroupInfo(
+    ur_kernel_handle_t, ur_device_handle_t, ur_kernel_group_info_t propName,
+    size_t propSize, void *pPropValue, size_t *pPropSizeRet) {
   UrReturnHelper ReturnValue(propSize, pPropValue, pPropSizeRet);
 
   if (propName == UR_KERNEL_GROUP_INFO_COMPILE_WORK_GROUP_SIZE) {
diff --git a/unified-runtime/source/adapters/offload/kernel.hpp b/unified-runtime/source/adapters/offload/kernel.hpp
index dee293aaa1b44..dea7e25d9da9e 100644
--- a/unified-runtime/source/adapters/offload/kernel.hpp
+++ b/unified-runtime/source/adapters/offload/kernel.hpp
@@ -1,3 +1,13 @@
+//===----------- kernel.hpp - LLVM Offload Adapter  -----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #pragma once
 
 #include <OffloadAPI.h>
diff --git a/unified-runtime/source/adapters/offload/platform.cpp b/unified-runtime/source/adapters/offload/platform.cpp
index 133a653baa7ca..da18fef81d360 100644
--- a/unified-runtime/source/adapters/offload/platform.cpp
+++ b/unified-runtime/source/adapters/offload/platform.cpp
@@ -1,3 +1,13 @@
+//===----------- platform.cpp - LLVM Offload Adapter  ---------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <OffloadAPI.h>
 #include <unordered_set>
 #include <ur/ur.hpp>
diff --git a/unified-runtime/source/adapters/offload/platform.hpp b/unified-runtime/source/adapters/offload/platform.hpp
index 100e103998364..82976e56f0508 100644
--- a/unified-runtime/source/adapters/offload/platform.hpp
+++ b/unified-runtime/source/adapters/offload/platform.hpp
@@ -1,12 +1,23 @@
+//===----------- platform.hpp - LLVM Offload Adapter  ---------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #pragma once
 
 #include "common.hpp"
-#include <ur_api.h>
 #include <OffloadAPI.h>
+#include <ur_api.h>
 #include <vector>
 
 struct ur_platform_handle_t_ : ur::offload::handle_base {
-  ur_platform_handle_t_(ol_platform_handle_t OffloadPlatform) : handle_base(), OffloadPlatform(OffloadPlatform) {};
+  ur_platform_handle_t_(ol_platform_handle_t OffloadPlatform)
+      : handle_base(), OffloadPlatform(OffloadPlatform) {};
 
   ol_platform_handle_t OffloadPlatform;
   std::vector<ur_device_handle_t_> Devices;
diff --git a/unified-runtime/source/adapters/offload/program.cpp b/unified-runtime/source/adapters/offload/program.cpp
index 8489b3c5b0a7e..c35b563c24822 100644
--- a/unified-runtime/source/adapters/offload/program.cpp
+++ b/unified-runtime/source/adapters/offload/program.cpp
@@ -1,3 +1,13 @@
+//===----------- program.cpp - LLVM Offload Adapter  ----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <OffloadAPI.h>
 #include <ur/ur.hpp>
 #include <ur_api.h>
diff --git a/unified-runtime/source/adapters/offload/program.hpp b/unified-runtime/source/adapters/offload/program.hpp
index 0639ab336c5fb..1d0263aad2998 100644
--- a/unified-runtime/source/adapters/offload/program.hpp
+++ b/unified-runtime/source/adapters/offload/program.hpp
@@ -1,10 +1,20 @@
+//===----------- program.hpp - LLVM Offload Adapter  ----------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #pragma once
 
-#include <ur_api.h>
 #include <OffloadAPI.h>
+#include <ur_api.h>
 
 #include "common.hpp"
 
 struct ur_program_handle_t_ : RefCounted {
- ol_program_handle_t OffloadProgram;
+  ol_program_handle_t OffloadProgram;
 };
diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp
index 32adb1f512c11..7ddb9b35c0ffa 100644
--- a/unified-runtime/source/adapters/offload/queue.cpp
+++ b/unified-runtime/source/adapters/offload/queue.cpp
@@ -1,3 +1,13 @@
+//===----------- queue.cpp - LLVM Offload Adapter  ------------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <OffloadAPI.h>
 #include <ur/ur.hpp>
 #include <ur_api.h>
diff --git a/unified-runtime/source/adapters/offload/queue.hpp b/unified-runtime/source/adapters/offload/queue.hpp
index 9406d460b7401..6afe4bf15098e 100644
--- a/unified-runtime/source/adapters/offload/queue.hpp
+++ b/unified-runtime/source/adapters/offload/queue.hpp
@@ -1,11 +1,21 @@
+//===----------- queue.hpp - LLVM Offload Adapter  ------------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #pragma once
 
-#include <ur_api.h>
 #include <OffloadAPI.h>
+#include <ur_api.h>
 
 #include "common.hpp"
 
 struct ur_queue_handle_t_ : RefCounted {
- ol_queue_handle_t OffloadQueue;
- ol_device_handle_t OffloadDevice;
+  ol_queue_handle_t OffloadQueue;
+  ol_device_handle_t OffloadDevice;
 };
diff --git a/unified-runtime/source/adapters/offload/usm.cpp b/unified-runtime/source/adapters/offload/usm.cpp
index f603516811f0a..497e454885f06 100644
--- a/unified-runtime/source/adapters/offload/usm.cpp
+++ b/unified-runtime/source/adapters/offload/usm.cpp
@@ -1,3 +1,13 @@
+//===----------- usm.cpp - LLVM Offload Adapter  --------------------------===//
+//
+// Copyright (C) 2025 Intel Corporation
+//
+// Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM
+// Exceptions. See LICENSE.TXT
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
 #include <OffloadAPI.h>
 #include <ur/ur.hpp>
 #include <ur_api.h>
diff --git a/unified-runtime/source/loader/ur_lib.cpp b/unified-runtime/source/loader/ur_lib.cpp
index c224ca00b1777..ca7c8f1bacc8f 100644
--- a/unified-runtime/source/loader/ur_lib.cpp
+++ b/unified-runtime/source/loader/ur_lib.cpp
@@ -259,7 +259,7 @@ ur_result_t urDeviceGetSelected(ur_platform_handle_t hPlatform,
       {UR_PLATFORM_BACKEND_HIP, "hip"},
       {UR_PLATFORM_BACKEND_NATIVE_CPU, "native_cpu"},
       {UR_PLATFORM_BACKEND_OFFLOAD, "offload"},
-    };
+  };
 
   if (!hPlatform) {
     return UR_RESULT_ERROR_INVALID_NULL_HANDLE;

From 626701db084e7a61ea50e283adb43b7ff0780016 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum@codeplay.com>
Date: Thu, 1 May 2025 12:19:43 +0100
Subject: [PATCH 10/11] Address review feedback

---
 unified-runtime/CMakeLists.txt                         | 1 +
 unified-runtime/source/adapters/offload/.clang-format  | 4 ----
 unified-runtime/source/adapters/offload/CMakeLists.txt | 7 ++++---
 3 files changed, 5 insertions(+), 7 deletions(-)
 delete mode 100644 unified-runtime/source/adapters/offload/.clang-format

diff --git a/unified-runtime/CMakeLists.txt b/unified-runtime/CMakeLists.txt
index 7d1ee861b1879..0e051b402c692 100644
--- a/unified-runtime/CMakeLists.txt
+++ b/unified-runtime/CMakeLists.txt
@@ -47,6 +47,7 @@ option(UR_BUILD_ADAPTER_HIP "Build the HIP adapter" OFF)
 option(UR_BUILD_ADAPTER_NATIVE_CPU "Build the Native-CPU adapter" OFF)
 option(UR_BUILD_ADAPTER_ALL "Build all currently supported adapters" OFF)
 option(UR_BUILD_ADAPTER_L0_V2 "Build the (experimental) Level-Zero v2 adapter" OFF)
+option(UR_BUILD_ADAPTER_OFFLOAD "Build the experimental Offload adapter" OFF)
 option(UR_STATIC_ADAPTER_L0 "Build the Level-Zero adapter as static and embed in the loader" OFF)
 option(UR_BUILD_EXAMPLE_CODEGEN "Build the codegen example." OFF)
 option(VAL_USE_LIBBACKTRACE_BACKTRACE "enable libbacktrace validation backtrace for linux" OFF)
diff --git a/unified-runtime/source/adapters/offload/.clang-format b/unified-runtime/source/adapters/offload/.clang-format
deleted file mode 100644
index c8daebc205b34..0000000000000
--- a/unified-runtime/source/adapters/offload/.clang-format
+++ /dev/null
@@ -1,4 +0,0 @@
----
-Language: Cpp
-BasedOnStyle: LLVM
-...
diff --git a/unified-runtime/source/adapters/offload/CMakeLists.txt b/unified-runtime/source/adapters/offload/CMakeLists.txt
index dfcafc0fa98e5..c68b9aba9c623 100644
--- a/unified-runtime/source/adapters/offload/CMakeLists.txt
+++ b/unified-runtime/source/adapters/offload/CMakeLists.txt
@@ -3,8 +3,6 @@
 # See LICENSE.TXT
 # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 
-set(UR_OFFLOAD_ADAPTER_DIR "${CMAKE_CURRENT_SOURCE_DIR}" CACHE PATH "Offload adapter directory") # TODO
-
 set(TARGET_NAME ur_adapter_offload)
 
 set(UR_OFFLOAD_INSTALL_DIR "" CACHE PATH "Path to the directory containing libomptarget.so etc")
@@ -17,7 +15,10 @@ if (UR_OFFLOAD_INCLUDE_DIR STREQUAL "")
     message(FATAL_ERROR "UR_OFFLOAD_INCLUDE_DIR must be defined for the Offload adapter")
 endif()
 
-# For the PTX workaround we need to link with CUDA.
+# When targeting CUDA devices, we need a workaround to avoid sending PTX to
+# liboffload as the CUDA plugin doesn't support it yet. The workaround is to
+# simply always link the incoming program so it ends up as CUBIN. Try to find
+# the CUDA driver so we can enable this where possible.
 if (NOT TARGET cudadrv)
         find_package(CUDA 10.1)
         add_library(cudadrv SHARED IMPORTED GLOBAL)

From b42db9e867ba3581dfdfdb196eb4babce8a65469 Mon Sep 17 00:00:00 2001
From: Callum Fare <callum@codeplay.com>
Date: Thu, 1 May 2025 13:51:46 +0100
Subject: [PATCH 11/11] Remove unneeded struct in olIterateDevices call

---
 .../source/adapters/offload/adapter.cpp       | 26 +++++++++----------
 1 file changed, 12 insertions(+), 14 deletions(-)

diff --git a/unified-runtime/source/adapters/offload/adapter.cpp b/unified-runtime/source/adapters/offload/adapter.cpp
index fd3608dcacb0c..9ee8ec38fa4cd 100644
--- a/unified-runtime/source/adapters/offload/adapter.cpp
+++ b/unified-runtime/source/adapters/offload/adapter.cpp
@@ -25,14 +25,11 @@ ur_adapter_handle_t_ Adapter{};
 ur_result_t ur_adapter_handle_t_::init() {
   auto Res = olInit();
 
-  struct InitUserData {
-    std::unordered_map<ol_platform_handle_t, ur_platform_handle_t> TempMap;
-  } InitUserData{{}};
-
   // Discover every platform and device
   Res = olIterateDevices(
       [](ol_device_handle_t D, void *UserData) {
-        auto *Data = reinterpret_cast<decltype(InitUserData) *>(UserData);
+        auto *Platforms =
+            reinterpret_cast<decltype(Adapter.Platforms) *>(UserData);
 
         ol_platform_handle_t Platform;
         olGetDeviceInfo(D, OL_DEVICE_INFO_PLATFORM, sizeof(Platform),
@@ -43,20 +40,21 @@ ur_result_t ur_adapter_handle_t_::init() {
         if (Backend == OL_PLATFORM_BACKEND_HOST) {
           Adapter.HostDevice = D;
         } else if (Backend != OL_PLATFORM_BACKEND_UNKNOWN) {
-          ur_platform_handle_t UrPlatform;
-          if (!Data->TempMap.count(Platform)) {
-            Adapter.Platforms.push_back(ur_platform_handle_t_{Platform});
-            UrPlatform = &Adapter.Platforms.back();
-            Data->TempMap.insert({Platform, UrPlatform});
-          } else {
-            UrPlatform = Data->TempMap[Platform];
+          auto URPlatform =
+              std::find_if(Platforms->begin(), Platforms->end(), [&](auto &P) {
+                return P.OffloadPlatform == Platform;
+              });
+
+          if (URPlatform == Platforms->end()) {
+            URPlatform =
+                Platforms->insert(URPlatform, ur_platform_handle_t_(Platform));
           }
 
-          UrPlatform->Devices.push_back(ur_device_handle_t_{UrPlatform, D});
+          URPlatform->Devices.push_back(ur_device_handle_t_{&*URPlatform, D});
         }
         return false;
       },
-      &InitUserData);
+      &Adapter.Platforms);
 
   (void)Res;