diff --git a/changelog.md b/changelog.md
index 788dd5de..00e1b544 100644
--- a/changelog.md
+++ b/changelog.md
@@ -6,6 +6,8 @@
 
 * Fixed compatibility issues with the latests Python versions (up to 3.12).
 
+* Fixed compilation on Apple Silicon (M1, M2).
+
 * Fixed deprecation warnings from the latest versions of `numpy`.
 
 * Dropped support for Python 3.6. You must have Python 3.7 or newer to install this version.
diff --git a/cppcore/CMakeLists.txt b/cppcore/CMakeLists.txt
index 05d480c7..22cb40cf 100644
--- a/cppcore/CMakeLists.txt
+++ b/cppcore/CMakeLists.txt
@@ -116,7 +116,7 @@ download_dependency(variant 1.1.4
                     https://raw.githubusercontent.com/mapbox/variant/v\${VERSION}/include
                     mapbox/variant.hpp mapbox/recursive_wrapper.hpp mapbox/variant_visitor.hpp)
 target_include_directories(cppcore SYSTEM PUBLIC ${VARIANT_INCLUDE_DIR})
 
-download_dependency(simdpp 2.0-rc2
+download_dependency(simdpp 2.1
                     https://github.com/p12tic/libsimdpp/archive /v\${VERSION}.tar.gz */simdpp)
 target_include_directories(cppcore SYSTEM PUBLIC ${SIMDPP_INCLUDE_DIR})
@@ -125,7 +125,11 @@ include(fmt)
 target_link_libraries(cppcore PUBLIC fmt)
 
 if(PB_NATIVE_SIMD AND NOT MSVC) # MSVC does not have anything like a /arch:native flag
-    target_compile_options(cppcore PUBLIC -march=native)
+    include(CheckCXXCompilerFlag)
+    check_cxx_compiler_flag(-march=native PB_HAS_ARCH_NATIVE)
+    if(PB_HAS_ARCH_NATIVE)
+        target_compile_options(cppcore PUBLIC -march=native)
+    endif()
 endif()
 
 if(PB_MKL)
@@ -140,7 +144,7 @@ if(PB_CUDA)
 endif()
 
 if(PB_TESTS)
-    set(catch_url https://raw.githubusercontent.com/philsquared/Catch/v\${VERSION}/single_include)
-    download_dependency(catch 1.8.1 ${catch_url} catch.hpp)
+    set(catch_url https://raw.githubusercontent.com/catchorg/Catch2/v\${VERSION}/single_include/catch2)
+    download_dependency(catch 2.13.10 ${catch_url} catch.hpp)
     add_subdirectory(tests)
 endif()
diff --git a/cppcore/include/support/simd.hpp b/cppcore/include/support/simd.hpp
index 63ccca12..8541078c 100644
--- a/cppcore/include/support/simd.hpp
+++ b/cppcore/include/support/simd.hpp
@@ -8,6 +8,8 @@
 # define SIMDPP_ARCH_X86_SSE3
 #elif defined(__SSE2__) || defined(_M_X64) || _M_IX86_FP == 2
 # define SIMDPP_ARCH_X86_SSE2
+#elif defined(__ARM_NEON)
+# define SIMDPP_ARCH_ARM_NEON
 #endif
 
 #if defined(__FMA__) || (defined(_MSC_VER) && defined(__AVX2__))
@@ -138,8 +140,13 @@ split_loop_t<scalar_t> split_loop(scalar_t const* p, idx_t start, idx_t end) {
 
 RAII class which disables floating-point denormals (flush-to-zero mode) */
 struct scope_disable_denormals {
+#if SIMDPP_USE_SSE2
     CPB_ALWAYS_INLINE scope_disable_denormals() { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); }
     CPB_ALWAYS_INLINE ~scope_disable_denormals() { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); }
+#else // ARM NEON defaults for flush-to-zero
+    scope_disable_denormals() {}
+    ~scope_disable_denormals() {}
+#endif
 };
 
 namespace detail {
@@ -179,7 +186,41 @@ namespace detail {
         return _mm_castpd_ps(r);
     }
 };
-#endif // SIMDPP_USE_SSE2
+#else // generic SIMD on ARM NEON or anything other than SSE/AVX
+template<>
+struct Gather<float64x2> {
+    CPB_ALWAYS_INLINE
+    static float64x2 call(double const* data, std::int32_t const* indices) {
+        auto const low = simdpp::load_splat<float64x2>(data + indices[0]);
+        auto const high = simdpp::load_splat<float64x2>(data + indices[1]);
+        return simdpp::zip2_lo(low, high);
+    }
+
+    CPB_ALWAYS_INLINE
+    static float64x2 call(std::complex<double> const* data, std::int32_t const* indices) {
+        return simdpp::load(data + indices[0]);
+    }
+};
+
+template<>
+struct Gather<float32x4> {
+    CPB_ALWAYS_INLINE
+    static float32x4 call(float const* data, std::int32_t const* indices) {
+        auto const a = simdpp::load_splat<float32x4>(data + indices[0]);
+        auto const b = simdpp::load_splat<float32x4>(data + indices[1]);
+        auto const c = simdpp::load_splat<float32x4>(data + indices[2]);
+        auto const d = simdpp::load_splat<float32x4>(data + indices[3]);
+        auto const ac = simdpp::zip4_lo(a, c);
+        auto const bd = simdpp::zip4_lo(b, d);
+        return simdpp::zip4_lo(ac, bd);
+    }
+
+    CPB_ALWAYS_INLINE
+    static float32x4 call(std::complex<float> const* data, std::int32_t const* indices) {
+        return simdpp::bit_cast<float32x4>(Gather<float64x2>::call(reinterpret_cast<double const*>(data), indices));
+    }
+};
+#endif
 
 #if SIMDPP_USE_AVX && !SIMDPP_USE_AVX2
 template<>
@@ -301,24 +342,24 @@ Vec addsub(Vec const& a, Vec const& b) {
 #if SIMDPP_USE_SSE3
 template<class E1, class E2> CPB_ALWAYS_INLINE
 float32x4 addsub(float32<4, E1> const& a, float32<4, E2> const& b) {
-    return _mm_addsub_ps(a.eval(), b.eval());
+    return _mm_addsub_ps(a.eval().native(), b.eval().native());
 }
 
 template<class E1, class E2> CPB_ALWAYS_INLINE
 float64x2 addsub(float64<2, E1> const& a, float64<2, E2> const& b) {
-    return _mm_addsub_pd(a.eval(), b.eval());
+    return _mm_addsub_pd(a.eval().native(), b.eval().native());
 }
 #endif // SIMDPP_USE_SSE3
 
 #if SIMDPP_USE_AVX
 template<class E1, class E2> CPB_ALWAYS_INLINE
 float32x8 addsub(float32<8, E1> const& a, float32<8, E2> const& b) {
-    return _mm256_addsub_ps(a.eval(), b.eval());
+    return _mm256_addsub_ps(a.eval().native(), b.eval().native());
 }
 
 template<class E1, class E2> CPB_ALWAYS_INLINE
 float64x4 addsub(float64<4, E1> const& a, float64<4, E2> const& b) {
-    return _mm256_addsub_pd(a.eval(), b.eval());
+    return _mm256_addsub_pd(a.eval().native(), b.eval().native());
 }
 #endif // SIMDPP_USE_AVX
 
diff --git a/pybinding/utils/cpuinfo.py b/pybinding/utils/cpuinfo.py
index 53b0c2f5..943b1387 100644
--- a/pybinding/utils/cpuinfo.py
+++ b/pybinding/utils/cpuinfo.py
@@ -52,12 +52,10 @@ def summary():
         return "py-cpuinfo is not installed"
 
     info = info.copy()
-    hz_raw, scale = info['hz_advertised_raw']
-    info['ghz'] = hz_raw * 10**(scale - 9)
     info['physical'] = physical_core_count()
     info['virtual'] = virtual_core_count()
     info['simd'] = _cpp.simd_info()
-    return "{brand}\n{physical}/{virtual} cores @ {ghz:.2g} GHz with {simd}".format_map(info)
+    return "{brand_raw}\n{physical}/{virtual} cores with {simd}".format_map(info)
 
 
 if __name__ == '__main__':