Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for Windows ARM64 #2089

Merged
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 11 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -180,7 +180,12 @@ option(OCIO_USE_OIIO_FOR_APPS "Request OIIO to build apps (ociolutimage, ociocon


if (NOT APPLE)
if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|x86_64|X86|i386|i686)")
if("${CMAKE_GENERATOR_PLATFORM}" MATCHES "(ARM64|arm64)" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64")
set(OCIO_ARCH_X86 0)
set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_SSE ON)
set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX OFF)
set(OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C OFF)
elseif ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(AMD64|IA64|EM64T|x86_64|X86|i386|i686)")
# Intel-based architecture (not APPLE)
if ("${CMAKE_SYSTEM_PROCESSOR}" MATCHES "(X86|i386|i686)")
set(OCIO_ARCH_X86_32 1)
Expand Down Expand Up @@ -270,7 +275,7 @@ option(OCIO_USE_AVX2 "Specify whether to enable AVX2 CPU performance optimizatio
option(OCIO_USE_AVX512 "Specify whether to enable AVX512 CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_AVX})
option(OCIO_USE_F16C "Specify whether to enable F16C CPU performance optimizations" ${OCIO_BUILD_ENABLE_OPTIMIZATIONS_F16C})

if (APPLE)
if (APPLE OR WIN32)
# TODO: Revisit whether that option is necessary.
option(OCIO_USE_SSE2NEON "Specify whether to enable SSE CPU performance optimizations using SSE2NEON for Apple ARM architecture" ON)
mark_as_advanced(OCIO_USE_SSE2NEON)
Expand Down Expand Up @@ -332,8 +337,10 @@ if(OCIO_USE_SIMD AND OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON)
add_library(sse2neon INTERFACE)
# Add the include directories to the target.
target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}")
# Ignore the warnings coming from sse2neon.h as they are false positives.
target_compile_options(sse2neon INTERFACE -Wno-unused-parameter)
if(NOT MSVC)
# Ignore the warnings coming from sse2neon.h as they are false positives.
target_compile_options(sse2neon INTERFACE -Wno-unused-parameter)
endif()
endif()
endif()

Expand Down
8 changes: 5 additions & 3 deletions share/cmake/modules/install/Installsse2neon.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ include(FetchContent)
set(FETCHCONTENT_BASE_DIR "${CMAKE_BINARY_DIR}/ext/build/sse2neon")
FetchContent_Declare(sse2neon
GIT_REPOSITORY https://github.com/DLTcollab/sse2neon.git
GIT_TAG v1.6.0
GIT_TAG 227cc413fb2d50b2a10073087be96b59d5364aea
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Any reason you couldn't pick an official version number here?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Long story short, we uncovered some issues with 1.7.0 in Blender on various platforms, and needed to update to a non-versioned commit, as they only version sse2neon once a year - the issues may or may not affect OCIO, but this is at least a known working and tested version that Blender uses.

)

# FetchContent_MakeAvailable is not available until CMake 3.14+.
Expand All @@ -38,6 +38,8 @@ if(NOT sse2neon_POPULATED)
add_library(sse2neon INTERFACE)
# Add the include directories to the target.
target_include_directories(sse2neon INTERFACE "${sse2neon_INCLUDE_DIR}")
# Ignore the warnings coming from sse2neon.h as they are false positives.
target_compile_options(sse2neon INTERFACE -Wno-unused-parameter)
if(NOT MSVC)
# Ignore the warnings coming from sse2neon.h as they are false positives.
target_compile_options(sse2neon INTERFACE -Wno-unused-parameter)
endif()
endif()
10 changes: 8 additions & 2 deletions share/cmake/utils/CheckSupportSSEUsingSSE2NEON.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,13 @@ include(CheckCXXSourceCompiles)
set(_cmake_required_flags_orig "${CMAKE_REQUIRED_FLAGS}")
set(_cmake_required_includes_orig "${CMAKE_REQUIRED_INCLUDES}")
set(_cmake_osx_architectures_orig "${CMAKE_OSX_ARCHITECTURES}")
set(_cmake_cxx_flags_orig "${CMAKE_CXX_FLAGS}")

if(APPLE AND COMPILER_SUPPORTS_ARM_NEON)
if(MSVC)
set(CMAKE_CXX_FLAGS "/Zc:preprocessor")
endif()

if((APPLE OR WIN32) AND COMPILER_SUPPORTS_ARM_NEON)

if("${CMAKE_OSX_ARCHITECTURES}" MATCHES "arm64;x86_64" OR
"${CMAKE_OSX_ARCHITECTURES}" MATCHES "x86_64;arm64")
Expand Down Expand Up @@ -63,8 +68,9 @@ endif()
set(CMAKE_REQUIRED_FLAGS "${_cmake_required_flags_orig}")
set(CMAKE_REQUIRED_INCLUDES "${_cmake_required_includes_orig}")
set(CMAKE_OSX_ARCHITECTURES "${_cmake_osx_architectures_orig}")
set(CMAKE_CXX_FLAGS "${_cmake_cxx_flags_orig}")

unset(_cmake_required_flags_orig)
unset(_cmake_required_includes_orig)
unset(_cmake_osx_architectures_orig)

unset(_cmake_cxx_flags_orig)
7 changes: 6 additions & 1 deletion share/cmake/utils/CompilerFlags.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,12 @@ if(OCIO_USE_SIMD)

if (OCIO_USE_SSE2NEON AND COMPILER_SUPPORTS_ARM_NEON)
include(CheckSupportSSEUsingSSE2NEON)
if(NOT COMPILER_SUPPORTS_SSE_WITH_SSE2NEON)
if(COMPILER_SUPPORTS_SSE_WITH_SSE2NEON)
if(WIN32)
anthony-linaro marked this conversation as resolved.
Show resolved Hide resolved
# Enable the "new" preprocessor, to more closely match Clang/GCC, required for sse2neon
set(PLATFORM_COMPILE_OPTIONS "${PLATFORM_COMPILE_OPTIONS};/Zc:preprocessor")
endif()
else()
set(OCIO_USE_SSE2NEON OFF)
endif()
endif()
Expand Down
2 changes: 1 addition & 1 deletion src/OpenColorIO/CPUInfo.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ CPUInfo::CPUInfo()
}
}

#elif defined(__aarch64__) // ARM Processor or Apple ARM.
#elif defined(__aarch64__) || defined(_M_ARM64) // ARM Processor or Apple ARM.
anthony-linaro marked this conversation as resolved.
Show resolved Hide resolved

CPUInfo::CPUInfo()
{
Expand Down
4 changes: 2 additions & 2 deletions src/OpenColorIO/CPUInfoConfig.h.in
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#cmakedefine01 OCIO_ARCH_X86_32

// Relevant only for arm64 architecture.
#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#cmakedefine01 OCIO_USE_SSE2NEON
#else
#define OCIO_USE_SSE2NEON 0
Expand All @@ -23,7 +23,7 @@

// Building for x86_64 processor on a non-ARM host architecture
// OR Building on/for an ARM architecture and using SSE2NEON.
#if (OCIO_ARCH_X86 && !defined(__aarch64__)) || (defined(__aarch64__) && OCIO_USE_SSE2NEON)
#if (OCIO_ARCH_X86 && !defined(__aarch64__)) || ((defined(__aarch64__) || defined(_M_ARM64)) && OCIO_USE_SSE2NEON)
#cmakedefine01 OCIO_USE_SSE2
#cmakedefine01 OCIO_USE_SSE3
#cmakedefine01 OCIO_USE_SSSE3
Expand Down
23 changes: 19 additions & 4 deletions src/OpenColorIO/SSE.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,25 @@
#if OCIO_USE_SSE2

// Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM).
#if !defined(__aarch64__)
#if !defined(__aarch64__) && !defined(_M_ARM64)
#if OCIO_USE_SSE2
#include <emmintrin.h>
#endif
#elif defined(__aarch64__)
#elif defined(__aarch64__) || defined(_M_ARM64)
// ARM architecture A64 (ARM64)
#if OCIO_USE_SSE2NEON
// MSVC doesn't like the redefinitions below and requires the existing functions to be undef-ed
#if defined(_M_ARM64)
#define _mm_max_ps _mm_max_ps_orig
#define _mm_min_ps _mm_min_ps_orig
#endif

#include <sse2neon.h>

#if defined(_M_ARM64)
#undef _mm_max_ps
#undef _mm_min_ps
#endif
#endif
#endif

Expand All @@ -30,7 +41,7 @@ namespace OCIO_NAMESPACE
// Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
// it is redefining two of the functions from sse2neon.

#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#if OCIO_USE_SSE2NEON
// Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to
// NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior.
Expand Down Expand Up @@ -77,6 +88,9 @@ static const __m128 EPOS128 = _mm_set1_ps(128.0f);

static const __m128 EPOSINF = _mm_set1_ps(std::numeric_limits<float>::infinity());

// These funtions won't work when using MSVC + ARM64 unless you specify /Zc:arm64-aliased-neon-types-
// This comes with it's own issues, so it is easier to just disable them when using MSVC + ARM64
#if !defined(_M_ARM64)
// Debug function to print out the contents of a floating-point SSE register
inline void ssePrintRegister(const char* msg, __m128& reg)
{
Expand All @@ -91,6 +105,7 @@ inline void ssePrintRegister(const char* msg, __m128i& reg)
int *r = (int*) &reg;
printf("%s : %d %d %d %d\n", msg, r[0], r[1], r[2], r[3]);
}
#endif

// Determine whether a floating-point value is negative based on its sign bit.
// This function will treat special values, like -0, -NaN, -Inf, as they were indeed
Expand Down Expand Up @@ -170,7 +185,7 @@ inline __m128 sseLog2(__m128 x)
{
// y = log2( x ) = log2( 2^exponent * mantissa )
// = exponent + log2( mantissa )

__m128 mantissa
= _mm_or_ps( // OR with EONE
_mm_andnot_ps( // NOT(EMASK) AND x
Expand Down
31 changes: 27 additions & 4 deletions src/OpenColorIO/SSE2.h
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,35 @@
#if OCIO_USE_SSE2

// Include the appropriate SIMD intrinsics header based on the architecture (Intel vs. ARM).
#if !defined(__aarch64__)
#if !defined(__aarch64__) && !defined(_M_ARM64)
#include <emmintrin.h>
#elif defined(__aarch64__)
#elif defined(__aarch64__) || defined(_M_ARM64)
// ARM architecture A64 (ARM64)
#if OCIO_USE_SSE2NEON
// MSVC doesn't like the redefinitions below and requires the existing functions to be undef-ed
#if defined(_M_ARM64)
#define _mm_max_ps _mm_max_ps_orig
#define _mm_min_ps _mm_min_ps_orig
#endif

#include <sse2neon.h>

#if defined(_M_ARM64)
#undef _mm_max_ps
#undef _mm_min_ps
#endif

// Current versions of MSVC do not define float16_t, so we do it ourselves using
// int16_t as an intermediate type
#if defined(_M_ARM64) && !defined(float16_t)
#define float16_t int16_t
#endif

// Current versions of MSVC do not define vst1q_f16, so we do it ourselves using
// internal methods from MSVC's arm_neon.h
#if defined(_M_ARM64) && !defined(vst1q_f16)
#define vst1q_f16(A, B) neon_st1m_q16((A), __float16x8_t_to_n128(B));
#endif
#endif
#endif

Expand All @@ -30,7 +53,7 @@ namespace OCIO_NAMESPACE
// Note that it is important for the code below this ifdef stays in the OCIO_NAMESPACE since
// it is redefining two of the functions from sse2neon.

#if defined(__aarch64__)
#if defined(__aarch64__) || defined(_M_ARM64)
#if OCIO_USE_SSE2NEON
// Using vmaxnmq_f32 and vminnmq_f32 rather than sse2neon's vmaxq_f32 and vminq_f32 due to
// NaN handling. This doesn't seem to be significantly slower than the default sse2neon behavior.
Expand Down Expand Up @@ -321,7 +344,7 @@ struct SSE2RGBAPack<BIT_DEPTH_F16>
sse2RGBATranspose_4x4(r, g, b, a, rgba0, rgba1, rgba2, rgba3);

#if OCIO_USE_SSE2NEON
// use neon hardware support for f32 to f16
// use neon hardware support for f32 to f16 (apart from in MSVC, which doesnt support it)
float16x8_t rgba;
float16x4_t rgba00_01 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba0));
float16x4_t rgba03_03 = vcvt_f16_f32(vreinterpretq_f32_m128(rgba1));
Expand Down
4 changes: 2 additions & 2 deletions src/OpenColorIO/ops/fixedfunction/FixedFunctionOpCPU.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1844,7 +1844,7 @@ __m128 Renderer_LIN_TO_PQ_SSE<true>::myPower(__m128 x, __m128 exp)
return ssePower(x, exp);
}

#ifdef _WIN32
#if (_MSC_VER >= 1920) && (OCIO_USE_AVX)
anthony-linaro marked this conversation as resolved.
Show resolved Hide resolved
// Only Windows compilers have built-in _mm_pow_ps() SVML intrinsic
// implementation, so non-fast SIMD version is available only on Windows for
// now.
Expand All @@ -1853,7 +1853,7 @@ __m128 Renderer_LIN_TO_PQ_SSE<false>::myPower(__m128 x, __m128 exp)
{
return _mm_pow_ps(x, exp);
}
#endif // _WIN32
#endif // (_MSC_VER >= 1920) && (OCIO_USE_AVX)

template<bool FAST_POWER>
void Renderer_LIN_TO_PQ_SSE<FAST_POWER>::apply(const void* inImg, void* outImg, long numPixels) const
Expand Down
Loading