From b68a43dd7f97d8ff104183a92285f8790372c4c2 Mon Sep 17 00:00:00 2001 From: Tom J Nowell Date: Sat, 20 Jun 2026 13:20:45 +0100 Subject: [PATCH 1/2] macOS: build the unitsync target on Apple Silicon Add the platform pieces unitsync needs on macOS: - System/Platform/Mac/CpuTopology.cpp: implements the cpu_topology API on macOS via sysctl. Uses hw.perflevel0/1.physicalcpu to split performance vs efficiency cores on Apple Silicon (falling back to a homogeneous core count on Intel / older kernels), and reports THREAD_PIN_POLICY_NONE since macOS has no affinity pinning (scheduling locality is a QoS hint instead). - tools/unitsync/CMakeLists.txt: an APPLE branch in the per-platform source selection, pulling in the Mac CpuTopology plus the shared Linux Hardware/SharedLib/SoLib sources (the Linux ThreadSupport does not build on macOS). With this, libunitsync.dylib builds as a native arm64 library under Homebrew GCC. Part of the macOS bring-up; depends on the macOS threading and CMake-configure changes in the related PRs to compile and configure cleanly. No effect on Linux/Windows. --- rts/System/Platform/Mac/CpuTopology.cpp | 93 +++++++++++++++++++++++++ tools/unitsync/CMakeLists.txt | 9 ++- 2 files changed, 100 insertions(+), 2 deletions(-) create mode 100644 rts/System/Platform/Mac/CpuTopology.cpp diff --git a/rts/System/Platform/Mac/CpuTopology.cpp b/rts/System/Platform/Mac/CpuTopology.cpp new file mode 100644 index 00000000000..d30469b0bd2 --- /dev/null +++ b/rts/System/Platform/Mac/CpuTopology.cpp @@ -0,0 +1,93 @@ +/* This file is part of the Recoil engine (GPL v2 or later), see LICENSE.html */ + +#include "System/Platform/CpuTopology.h" +#include +#include + +namespace cpu_topology { + +namespace { + +// Read an unsigned sysctl value by name. Returns 0 if the key is unavailable +// (e.g. perflevel keys on Intel Macs or pre-Apple-Silicon kernels). +unsigned int ReadSysctlUInt(const char* name) { + int value = 0; + size_t valueSize = sizeof(value); + if (sysctlbyname(name, &value, &valueSize, nullptr, 0) != 0) + return 0; + return (value > 0) ? static_cast(value) : 0; +} + +unsigned int BitsForCount(unsigned int n) { + if (n == 0) return 0; + if (n >= 32) return 0xFFFFFFFFu; + return (1u << n) - 1u; +} + +} // namespace + +ThreadPinPolicy GetThreadPinPolicy() { + // macOS has no pthread_setaffinity_np equivalent. Scheduling locality is + // instead expressed via QOS classes; see Platform/Mac/ThreadSupport.cpp. + return THREAD_PIN_POLICY_NONE; +} + +ProcessorMasks GetProcessorMasks() { + ProcessorMasks masks{}; + + // Apple Silicon exposes per-perflevel core counts. perflevel0 is the + // high-performance (P) cluster; perflevel1, when present, is the + // efficiency (E) cluster. Intel Macs and older kernels do not expose + // these keys; fall back to treating every core as a P-core there. + const unsigned int numPCores = ReadSysctlUInt("hw.perflevel0.physicalcpu"); + const unsigned int numECores = ReadSysctlUInt("hw.perflevel1.physicalcpu"); + + if (numPCores > 0) { + masks.performanceCoreMask = BitsForCount(numPCores); + // E-cores occupy the bits above the P-cores in the combined mask. + const unsigned int totalCores = numPCores + numECores; + const unsigned int allMask = BitsForCount(totalCores); + masks.efficiencyCoreMask = allMask & ~masks.performanceCoreMask; + } else { + // Intel Mac / unknown topology: treat the visible core count as + // homogeneous P-cores. Matches prior behavior on those targets. + unsigned int numCores = std::thread::hardware_concurrency(); + if (numCores == 0) numCores = 4; + masks.performanceCoreMask = BitsForCount(numCores); + masks.efficiencyCoreMask = 0; + } + + // macOS does not expose SMT/HT details; report all visible cores as + // hyper-thread-low so callers consuming those masks stay consistent. + masks.hyperThreadLowMask = masks.performanceCoreMask | masks.efficiencyCoreMask; + masks.hyperThreadHighMask = 0; + + return masks; +} + +ProcessorCaches GetProcessorCache() { + ProcessorCaches caches; + + ProcessorGroupCaches group; + unsigned int numCores = std::thread::hardware_concurrency(); + if (numCores == 0) numCores = 4; + group.groupMask = BitsForCount(numCores); + + // Try to get cache sizes via sysctl + size_t size = sizeof(uint64_t); + uint64_t cacheSize = 0; + + if (sysctlbyname("hw.l1dcachesize", &cacheSize, &size, nullptr, 0) == 0) + group.cacheSizes[0] = static_cast(cacheSize); + + if (sysctlbyname("hw.l2cachesize", &cacheSize, &size, nullptr, 0) == 0) + group.cacheSizes[1] = static_cast(cacheSize); + + if (sysctlbyname("hw.l3cachesize", &cacheSize, &size, nullptr, 0) == 0) + group.cacheSizes[2] = static_cast(cacheSize); + + caches.groupCaches.push_back(group); + return caches; +} + +} // namespace cpu_topology diff --git a/tools/unitsync/CMakeLists.txt b/tools/unitsync/CMakeLists.txt index 00f124f0a78..f7f5169e539 100644 --- a/tools/unitsync/CMakeLists.txt +++ b/tools/unitsync/CMakeLists.txt @@ -107,11 +107,16 @@ if (WIN32) list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Win/WinVersion.cpp") list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/SharedLib.cpp") list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Win/DllLib.cpp") -else (WIN32) +elseif (APPLE) + list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Mac/CpuTopology.cpp") + list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Linux/Hardware.cpp") + list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/SharedLib.cpp") + list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Linux/SoLib.cpp") +else () list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Linux/CpuTopology.cpp") list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Linux/Hardware.cpp") list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Linux/ThreadSupport.cpp") -endif (WIN32) +endif () set(unitsync_files ${sources_engine_System_FileSystem} From 88e5b6ff6d85363ecb2f0574aa2a90d182ed9c0f Mon Sep 17 00:00:00 2001 From: Tom J Nowell Date: Thu, 25 Jun 2026 20:25:39 +0100 Subject: [PATCH 2/2] macOS: extract low-bits mask helper into System/BitUtils.h Move the local BitsForCount helper out of Mac/CpuTopology.cpp into a generic spring::LowBitsMask in System/BitUtils.h, per review feedback. It saturates at the type width to avoid the 1< +#include + +namespace spring { + +// Returns a mask with the low `n` bits set. Saturates at the width of T to +// avoid the undefined behavior of shifting by >= the type's bit width. +template +constexpr T LowBitsMask(unsigned int n) { + if (n == 0) + return T(0); + if (n >= sizeof(T) * CHAR_BIT) + return ~T(0); + return (T(1) << n) - T(1); +} + +} // namespace spring diff --git a/rts/System/Platform/Mac/CpuTopology.cpp b/rts/System/Platform/Mac/CpuTopology.cpp index d30469b0bd2..0f1f6be0a1d 100644 --- a/rts/System/Platform/Mac/CpuTopology.cpp +++ b/rts/System/Platform/Mac/CpuTopology.cpp @@ -1,6 +1,7 @@ /* This file is part of the Recoil engine (GPL v2 or later), see LICENSE.html */ #include "System/Platform/CpuTopology.h" +#include "System/BitUtils.h" #include #include @@ -18,12 +19,6 @@ unsigned int ReadSysctlUInt(const char* name) { return (value > 0) ? static_cast(value) : 0; } -unsigned int BitsForCount(unsigned int n) { - if (n == 0) return 0; - if (n >= 32) return 0xFFFFFFFFu; - return (1u << n) - 1u; -} - } // namespace ThreadPinPolicy GetThreadPinPolicy() { @@ -43,17 +38,17 @@ ProcessorMasks GetProcessorMasks() { const unsigned int numECores = ReadSysctlUInt("hw.perflevel1.physicalcpu"); if (numPCores > 0) { - masks.performanceCoreMask = BitsForCount(numPCores); + masks.performanceCoreMask = spring::LowBitsMask(numPCores); // E-cores occupy the bits above the P-cores in the combined mask. const unsigned int totalCores = numPCores + numECores; - const unsigned int allMask = BitsForCount(totalCores); + const unsigned int allMask = spring::LowBitsMask(totalCores); masks.efficiencyCoreMask = allMask & ~masks.performanceCoreMask; } else { // Intel Mac / unknown topology: treat the visible core count as // homogeneous P-cores. Matches prior behavior on those targets. unsigned int numCores = std::thread::hardware_concurrency(); if (numCores == 0) numCores = 4; - masks.performanceCoreMask = BitsForCount(numCores); + masks.performanceCoreMask = spring::LowBitsMask(numCores); masks.efficiencyCoreMask = 0; } @@ -71,7 +66,7 @@ ProcessorCaches GetProcessorCache() { ProcessorGroupCaches group; unsigned int numCores = std::thread::hardware_concurrency(); if (numCores == 0) numCores = 4; - group.groupMask = BitsForCount(numCores); + group.groupMask = spring::LowBitsMask(numCores); // Try to get cache sizes via sysctl size_t size = sizeof(uint64_t);