From b68a43dd7f97d8ff104183a92285f8790372c4c2 Mon Sep 17 00:00:00 2001
From: Tom J Nowell <contact@tomjn.com>
Date: Sat, 20 Jun 2026 13:20:45 +0100
Subject: [PATCH 1/2] macOS: build the unitsync target on Apple Silicon

Add the platform pieces unitsync needs on macOS:

- System/Platform/Mac/CpuTopology.cpp: implements the cpu_topology API
  on macOS via sysctl. Uses hw.perflevel0/1.physicalcpu to split
  performance vs efficiency cores on Apple Silicon (falling back to a
  homogeneous core count on Intel / older kernels), and reports
  THREAD_PIN_POLICY_NONE since macOS has no affinity pinning (scheduling
  locality is a QoS hint instead).

- tools/unitsync/CMakeLists.txt: an APPLE branch in the per-platform
  source selection, pulling in the Mac CpuTopology, the Linux Hardware
  and SoLib sources, and the platform-neutral SharedLib (the Linux
  ThreadSupport is left out because it does not build on macOS).

With this, libunitsync.dylib builds as a native arm64 library under
Homebrew GCC. Part of the macOS bring-up; depends on the macOS
threading and CMake-configure changes in the related PRs to compile and
configure cleanly. No effect on Linux/Windows.
---
 rts/System/Platform/Mac/CpuTopology.cpp | 93 +++++++++++++++++++++++++
 tools/unitsync/CMakeLists.txt           |  9 ++-
 2 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100644 rts/System/Platform/Mac/CpuTopology.cpp
diff --git a/rts/System/Platform/Mac/CpuTopology.cpp b/rts/System/Platform/Mac/CpuTopology.cpp
new file mode 100644
index 00000000000..d30469b0bd2
--- /dev/null
+++ b/rts/System/Platform/Mac/CpuTopology.cpp
@@ -0,0 +1,93 @@
+/* This file is part of the Recoil engine (GPL v2 or later), see LICENSE.html */
+
+#include "System/Platform/CpuTopology.h"
+#include <sys/sysctl.h>
+#include <thread>
+
+namespace cpu_topology {
+
+namespace {
+
+// Read an int-sized sysctl value by name. Returns 0 if the lookup fails (e.g.
+// perflevel keys on Intel Macs or older kernels) or the value is not positive.
+unsigned int ReadSysctlUInt(const char* name) {
+	int value = 0;
+	size_t valueSize = sizeof(value);
+	if (sysctlbyname(name, &value, &valueSize, nullptr, 0) != 0)
+		return 0;
+	return (value > 0) ? static_cast<unsigned int>(value) : 0;
+}
+
+unsigned int BitsForCount(unsigned int n) {
+	if (n == 0) return 0;
+	if (n >= 32) return 0xFFFFFFFFu;
+	return (1u << n) - 1u;
+}
+
+} // namespace
+
+ThreadPinPolicy GetThreadPinPolicy() {
+	// Always NONE: macOS has no pthread_setaffinity_np equivalent. Locality is
+	// hinted through QoS classes instead; see Platform/Mac/ThreadSupport.cpp.
+	return THREAD_PIN_POLICY_NONE;
+}
+
+// Builds core masks from detected core counts: P-cores occupy the low bits, E-cores the bits above.
+ProcessorMasks GetProcessorMasks() {
+	ProcessorMasks masks{};
+	// hw.perflevel0 is the high-performance (P) cluster and hw.perflevel1,
+	// when present, the efficiency (E) cluster. ReadSysctlUInt yields 0 for
+	// a missing key, so machines without these keys (Intel Macs, older
+	// kernels) take the fallback branch and P-only machines get no E bits.
+	const unsigned int numPCores = ReadSysctlUInt("hw.perflevel0.physicalcpu");
+	const unsigned int numECores = ReadSysctlUInt("hw.perflevel1.physicalcpu");
+
+	if (numPCores > 0) {
+		masks.performanceCoreMask = BitsForCount(numPCores);
+		// E-cores occupy the bits above the P-cores in the combined mask.
+		const unsigned int totalCores = numPCores + numECores;
+		const unsigned int allMask = BitsForCount(totalCores);
+		masks.efficiencyCoreMask = allMask & ~masks.performanceCoreMask;
+	} else {
+		// Intel Mac / unknown topology: treat the visible core count as
+		// homogeneous P-cores. Matches prior behavior on those targets.
+		unsigned int numCores = std::thread::hardware_concurrency();
+		if (numCores == 0) numCores = 4;
+		masks.performanceCoreMask = BitsForCount(numCores);
+		masks.efficiencyCoreMask = 0;
+	}
+
+	// SMT/HT topology is not queried here: every core in the P/E masks is
+	// reported as hyper-thread-low and the high mask is left empty.
+	masks.hyperThreadLowMask = masks.performanceCoreMask | masks.efficiencyCoreMask;
+	masks.hyperThreadHighMask = 0;
+
+	return masks;
+}
+
+// Reports all visible cores as one cache group sized from the hw.l*cachesize sysctls.
+ProcessorCaches GetProcessorCache() {
+	ProcessorCaches caches;
+	ProcessorGroupCaches group;
+	unsigned int numCores = std::thread::hardware_concurrency();
+	if (numCores == 0) numCores = 4;
+	group.groupMask = BitsForCount(numCores);
+
+	// Try to get cache sizes via sysctl
+	size_t size = sizeof(uint64_t);
+
+	// Query each level separately: sysctlbyname rewrites the length argument
+	// with the number of bytes copied, so it must be re-armed before every call.
+	static constexpr const char* kCacheKeys[] = {"hw.l1dcachesize", "hw.l2cachesize", "hw.l3cachesize"};
+	for (size_t level = 0; level < sizeof(kCacheKeys) / sizeof(kCacheKeys[0]); ++level) {
+		uint64_t cacheSize = 0;
+		size = sizeof(cacheSize);
+		if (sysctlbyname(kCacheKeys[level], &cacheSize, &size, nullptr, 0) == 0)
+			group.cacheSizes[level] = static_cast<uint32_t>(cacheSize);
+	}
+
+	caches.groupCaches.push_back(group);
+	return caches;
+}
+
+} // namespace cpu_topology
diff --git a/tools/unitsync/CMakeLists.txt b/tools/unitsync/CMakeLists.txt
index 00f124f0a78..f7f5169e539 100644
--- a/tools/unitsync/CMakeLists.txt
+++ b/tools/unitsync/CMakeLists.txt
@@ -107,11 +107,16 @@ if (WIN32)
 	list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Win/WinVersion.cpp")
 	list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/SharedLib.cpp")
 	list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Win/DllLib.cpp")
-else (WIN32)
+elseif (APPLE)
+	list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Mac/CpuTopology.cpp")
+	list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Linux/Hardware.cpp")
+	list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/SharedLib.cpp")
+	list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Linux/SoLib.cpp")
+else ()
 	list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Linux/CpuTopology.cpp")
 	list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Linux/Hardware.cpp")
 	list(APPEND main_files "${ENGINE_SRC_ROOT}/System/Platform/Linux/ThreadSupport.cpp")
-endif (WIN32)
+endif ()
 
 set(unitsync_files
 	${sources_engine_System_FileSystem}

From 88e5b6ff6d85363ecb2f0574aa2a90d182ed9c0f Mon Sep 17 00:00:00 2001
From: Tom J Nowell <contact@tomjn.com>
Date: Thu, 25 Jun 2026 20:25:39 +0100
Subject: [PATCH 2/2] macOS: extract low-bits mask helper into
 System/BitUtils.h

Move the local BitsForCount helper out of Mac/CpuTopology.cpp into a
generic spring::LowBitsMask in System/BitUtils.h, per review feedback.
It saturates at the type width to avoid the 1<<width shift UB.
---
 rts/System/BitUtils.h                   | 21 +++++++++++++++++++++
 rts/System/Platform/Mac/CpuTopology.cpp | 15 +++++----------
 2 files changed, 26 insertions(+), 10 deletions(-)
 create mode 100644 rts/System/BitUtils.h

diff --git a/rts/System/BitUtils.h b/rts/System/BitUtils.h
new file mode 100644
index 00000000000..4bb70b822c5
--- /dev/null
+++ b/rts/System/BitUtils.h
@@ -0,0 +1,21 @@
+/* This file is part of the Recoil engine (GPL v2 or later), see LICENSE.html */
+
+#pragma once
+
+#include <cstdint>
+#include <climits>
+
+namespace spring {
+
+// Returns a mask with the low `n` bits set; `n` at or above the width of T
+// yields all bits set. The mask is built in uintmax_t so that no shift reaches
+// the type's bit width and a signed T never overflows.
+template<class T = uint32_t>
+constexpr T LowBitsMask(unsigned int n) {
+	static_assert(sizeof(T) <= sizeof(std::uintmax_t), "T is wider than uintmax_t");
+	if (n >= sizeof(T) * CHAR_BIT)
+		return static_cast<T>(~T(0));
+	return static_cast<T>(~(~std::uintmax_t{0} << n));
+}
+
+} // namespace spring
diff --git a/rts/System/Platform/Mac/CpuTopology.cpp b/rts/System/Platform/Mac/CpuTopology.cpp
index d30469b0bd2..0f1f6be0a1d 100644
--- a/rts/System/Platform/Mac/CpuTopology.cpp
+++ b/rts/System/Platform/Mac/CpuTopology.cpp
@@ -1,6 +1,7 @@
 /* This file is part of the Recoil engine (GPL v2 or later), see LICENSE.html */
 
 #include "System/Platform/CpuTopology.h"
+#include "System/BitUtils.h"
 #include <sys/sysctl.h>
 #include <thread>
 
@@ -18,12 +19,6 @@ unsigned int ReadSysctlUInt(const char* name) {
 	return (value > 0) ? static_cast<unsigned int>(value) : 0;
 }
 
-unsigned int BitsForCount(unsigned int n) {
-	if (n == 0) return 0;
-	if (n >= 32) return 0xFFFFFFFFu;
-	return (1u << n) - 1u;
-}
-
 } // namespace
 
 ThreadPinPolicy GetThreadPinPolicy() {
@@ -43,17 +38,17 @@ ProcessorMasks GetProcessorMasks() {
 	const unsigned int numECores = ReadSysctlUInt("hw.perflevel1.physicalcpu");
 
 	if (numPCores > 0) {
-		masks.performanceCoreMask = BitsForCount(numPCores);
+		masks.performanceCoreMask = spring::LowBitsMask(numPCores);
 		// E-cores occupy the bits above the P-cores in the combined mask.
 		const unsigned int totalCores = numPCores + numECores;
-		const unsigned int allMask = BitsForCount(totalCores);
+		const unsigned int allMask = spring::LowBitsMask(totalCores);
 		masks.efficiencyCoreMask = allMask & ~masks.performanceCoreMask;
 	} else {
 		// Intel Mac / unknown topology: treat the visible core count as
 		// homogeneous P-cores. Matches prior behavior on those targets.
 		unsigned int numCores = std::thread::hardware_concurrency();
 		if (numCores == 0) numCores = 4;
-		masks.performanceCoreMask = BitsForCount(numCores);
+		masks.performanceCoreMask = spring::LowBitsMask(numCores);
 		masks.efficiencyCoreMask = 0;
 	}
 
@@ -71,7 +66,7 @@ ProcessorCaches GetProcessorCache() {
 	ProcessorGroupCaches group;
 	unsigned int numCores = std::thread::hardware_concurrency();
 	if (numCores == 0) numCores = 4;
-	group.groupMask = BitsForCount(numCores);
+	group.groupMask = spring::LowBitsMask(numCores);
 
 	// Try to get cache sizes via sysctl
 	size_t size = sizeof(uint64_t);