From 55cd52c1f9675c557bd1a88755a0604239055aea Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 13:22:29 +0200 Subject: [PATCH 01/57] Tentative support for altivec --- .github/workflows/cross.yml | 1 + CMakeLists.txt | 3 + docs/Doxyfile | 1 + include/xsimd/arch/xsimd_altivec.hpp | 1837 +++++++++++++++++ include/xsimd/arch/xsimd_isa.hpp | 4 + include/xsimd/config/xsimd_config.hpp | 13 +- include/xsimd/types/xsimd_all_registers.hpp | 2 + .../xsimd/types/xsimd_altivec_register.hpp | 57 + 8 files changed, 1917 insertions(+), 1 deletion(-) create mode 100644 include/xsimd/arch/xsimd_altivec.hpp create mode 100644 include/xsimd/types/xsimd_altivec_register.hpp diff --git a/.github/workflows/cross.yml b/.github/workflows/cross.yml index 071e85f25..e71096f02 100644 --- a/.github/workflows/cross.yml +++ b/.github/workflows/cross.yml @@ -13,6 +13,7 @@ jobs: - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=neon', full: 'ON'} - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=vfpv3-d16', full: 'OFF' } # no neon - { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' } + - { platform: 'ppc', arch: 'powerpc', dir: 'powerpc-linux-gnu', flags: '-maltivec', full: 'OFF' } sys: - { compiler: 'gcc', version: '10' } steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index 860a84bad..ea30d6814 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ message(STATUS "xsimd v${${PROJECT_NAME}_VERSION}") set(XSIMD_HEADERS ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_constants.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx.hpp +${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx2.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_sse.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma4.hpp @@ -49,6 +50,7 @@ ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse3.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_1.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_2.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_ssse3.hpp +${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_altivec.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sve.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_arch.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp @@ -70,6 +72,7 @@ ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_sse_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma4_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_common_arch.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_register.hpp +${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_altivec_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_rvv_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse2_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse3_register.hpp diff --git a/docs/Doxyfile b/docs/Doxyfile index 390baf223..9de40e8da 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -9,6 +9,7 @@ INPUT = ../include/xsimd/types/xsimd_api.hpp \ ../include/xsimd/memory/xsimd_aligned_allocator.hpp \ ../include/xsimd/types/xsimd_common_arch.hpp \ ../include/xsimd/types/xsimd_traits.hpp \ + ../include/xsimd/types/xsimd_altivec_register.hpp \ ../include/xsimd/types/xsimd_avx2_register.hpp \ ../include/xsimd/types/xsimd_avx512bw_register.hpp \ ../include/xsimd/types/xsimd_avx512cd_register.hpp \ diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp new file mode 100644 index 000000000..23206ad5d --- /dev/null +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -0,0 +1,1837 @@ 
+/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_ALTIVEC_HPP +#define XSIMD_ALTIVEC_HPP + +#include +#include +#include + +#include "../types/xsimd_altivec_register.hpp" + +namespace xsimd +{ + template + struct batch_bool_constant; + + template + XSIMD_INLINE batch bitwise_cast(batch const& x) noexcept; + + template + struct batch_constant; + + namespace kernel + { +#if 0 + using namespace types; + + namespace detail + { + constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z) + { + return (z << 6) | (y << 4) | (x << 2) | w; + } + constexpr uint32_t shuffle(uint32_t x, uint32_t y) + { + return (y << 1) | x; + } + + constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z) + { + return shuffle(w % 4, x % 4, y % 4, z % 4); + } + + constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x) + { + return shuffle(w % 2, x % 2); + } + } + + // fwd + template + XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept; + template + XSIMD_INLINE batch avg(batch const&, batch const&, requires_arch) noexcept; + template + XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; + + // abs + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + __m128d sign_mask = _mm_set1_pd(-0.f); // -0.f = 1 << 31 + return _mm_andnot_pd(sign_mask, self); + } + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 + return _mm_andnot_ps(sign_mask, self); + } + + // add + template ::value, void>::type> + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_add_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_add_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_add_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_add_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_add_ps(self, other); + } + + template + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_add_pd(self, other); + } + + // all + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_ps(self) == 0x0F; + } + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_pd(self) == 0x03; + } + template ::value, void>::type> + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_epi8(self) == 0xFFFF; + } + + // any + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_ps(self) != 0; + } + template + XSIMD_INLINE bool 
any(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_pd(self) != 0; + } + template ::value, void>::type> + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_epi8(self) != 0; + } + + // avgr + template ::value, void>::type> + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_avg_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_avg_epu16(self, other); + } + else + { + return avgr(self, other, common {}); + } + } + + // avg + template ::value, void>::type> + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, common {}); + } + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return { bitwise_cast(batch(self.data)).data }; + } + + // bitwise_and + template + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_and_ps(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_and_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_and_si128(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_and_si128(self, other); + } + + template + batch XSIMD_INLINE bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_and_pd(self, other); + } + + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_and_pd(self, other); + } + + // bitwise_andnot + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_andnot_ps(other, self); + } + + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_andnot_ps(other, self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_andnot_si128(other, self); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_andnot_si128(other, self); + } + + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_andnot_pd(other, self); + } + + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_andnot_pd(other, self); + } + + // bitwise_lshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return 
_mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_slli_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_slli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_slli_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // bitwise_not + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); + } + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_si128(self, _mm_set1_epi32(-1)); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_si128(self, _mm_set1_epi32(-1)); + } + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); + } + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); + } + + // bitwise_or + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_ps(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_si128(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_si128(self, other); + } + + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_pd(self, other); + } + + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_pd(self, other); + } + + // bitwise_rshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF); + __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self); + __m128i res = _mm_srai_epi16(self, other); + return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_srai_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_srai_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + // from https://github.com/samyvilar/vect/blob/master/vect_128.h + return _mm_or_si128( + _mm_srli_epi64(self, other), + _mm_slli_epi64( + _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32), + 64 - other)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { 
+ return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_srli_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_srli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_srli_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + // bitwise_xor + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_si128(self, other); + } + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_si128(self, other); + } + + // bitwise_cast + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castsi128_ps(self); + } + template ::type>::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castps_si128(self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castsi128_pd(self); + } + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castps_pd(self); + } + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castpd_ps(self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castpd_si128(self); + } + + // broadcast + template + batch XSIMD_INLINE broadcast(float val, requires_arch) noexcept + { + return _mm_set1_ps(val); + } + template ::value, void>::type> + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_set1_epi8(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_set1_epi16(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_set1_epi32(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_set1_epi64x(val); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept + { + return _mm_set1_pd(val); + } + + // store_complex + namespace detail + { + // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned + // complex_low + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) 
noexcept + { + return _mm_unpacklo_ps(self.real(), self.imag()); + } + // complex_high + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpackhi_ps(self.real(), self.imag()); + } + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpacklo_pd(self.real(), self.imag()); + } + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpackhi_pd(self.real(), self.imag()); + } + } + + // decr_if + template ::value, void>::type> + XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return self + batch(mask.data); + } + + // div + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_div_ps(self, other); + } + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_div_pd(self, other); + } + + // fast_cast + namespace detail + { + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_cvtepi32_ps(self); + } + + template + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to sse2 + __m128i xH = _mm_srli_epi64(x, 32); + xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84 + __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); + __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 + __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 + return _mm_add_pd(f, _mm_castsi128_pd(xL)); + } + + template + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to sse2 + __m128i xH = _mm_srai_epi32(x, 16); + xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); + xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67 + __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); + __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 + __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 + return _mm_add_pd(f, _mm_castsi128_pd(xL)); + } + + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_cvttps_epi32(self); + } + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpeq_ps(self, other); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other))); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmpeq_epi8(self, other); 
+ } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmpeq_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmpeq_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m128i tmp1 = _mm_cmpeq_epi32(self, other); + __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1); + __m128i tmp3 = _mm_and_si128(tmp1, tmp2); + __m128i tmp4 = _mm_srai_epi32(tmp3, 31); + return _mm_shuffle_epi32(tmp4, 0xF5); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self != other); + } + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpeq_pd(self, other); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); + } + + // first + template + XSIMD_INLINE float first(batch const& self, requires_arch) noexcept + { + return _mm_cvtss_f32(self); + } + + template + XSIMD_INLINE double first(batch const& self, requires_arch) noexcept + { + return _mm_cvtsd_f64(self); + } + + template ::value, void>::type> + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return static_cast(_mm_cvtsi128_si32(self) & 0xFF); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return static_cast(_mm_cvtsi128_si32(self) & 0xFFFF); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return static_cast(_mm_cvtsi128_si32(self)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { +#if defined(__x86_64__) + return static_cast(_mm_cvtsi128_si64(self)); +#else + __m128i m; + _mm_storel_epi64(&m, self); + int64_t i; + std::memcpy(&i, &m, sizeof(i)); + return i; +#endif + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // from_mask + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint32_t lut[][4] = { + { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + }; + assert(!(mask & ~0xFul) && "inbound mask"); + return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask])); + } + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut[][4] = { + { 0x0000000000000000ul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + 
{ 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + }; + assert(!(mask & ~0x3ul) && "inbound mask"); + return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask])); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut64[] = { + 0x0000000000000000, + 0x000000000000FFFF, + 0x00000000FFFF0000, + 0x00000000FFFFFFFF, + 0x0000FFFF00000000, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF, + 0xFFFF000000000000, + 0xFFFF00000000FFFF, + 0xFFFF0000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF0000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFFFFFFFFFF, + }; + alignas(A::alignment()) static const uint32_t lut32[] = { + 0x00000000, + 0x000000FF, + 0x0000FF00, + 0x0000FFFF, + 0x00FF0000, + 0x00FF00FF, + 0x00FFFF00, + 0x00FFFFFF, + 0xFF000000, + 0xFF0000FF, + 0xFF00FF00, + 0xFF00FFFF, + 0xFFFF0000, + 0xFFFF00FF, + 0xFFFFFF00, + 0xFFFFFFFF, + }; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(!(mask & ~0xFFFF) && "inbound mask"); + return _mm_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(!(mask & ~0xFF) && "inbound mask"); + return _mm_set_epi64x(lut64[mask >> 4], lut64[mask & 0xF]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_castps_si128(from_mask(batch_bool {}, mask, sse2 {})); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_castpd_si128(from_mask(batch_bool {}, mask, sse2 {})); + } + } + + // ge + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpge_ps(self, other); + } + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpge_pd(self, other); + } + + // gt + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpgt_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmpgt_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmpgt_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmpgt_epi32(self, other); + } + else + { + return gt(self, other, common {}); + } + } + else + { + return gt(self, other, common {}); + } + } + + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpgt_pd(self, other); + } + + // haddp + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); + __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); + __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]); + tmp0 = _mm_add_ps(tmp0, tmp1); + tmp1 = _mm_unpacklo_ps(row[2], row[3]); + tmp1 = _mm_add_ps(tmp1, tmp2); + tmp2 = _mm_movehl_ps(tmp1, tmp0); + tmp0 = _mm_movelh_ps(tmp0, tmp1); + return _mm_add_ps(tmp0, tmp2); + } + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), + _mm_unpackhi_pd(row[0], row[1])); + } + + // incr_if + template ::value, void>::type> + XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return 
self - batch(mask.data); + } + + // insert + template ::value, void>::type> + XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_insert_epi16(self, val, I); + } + else + { + return insert(self, val, pos, common {}); + } + } + + // isnan + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm_cmpunord_ps(self, self); + } + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm_cmpunord_pd(self, self); + } + + // load_aligned + template + XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + return _mm_load_ps(mem); + } + template ::value, void>::type> + XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return _mm_load_si128((__m128i const*)mem); + } + template + XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + return _mm_load_pd(mem); + } + + // load_unaligned + template + XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_ps(mem); + } + template ::value, void>::type> + XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_si128((__m128i const*)mem); + } + template + XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_pd(mem); + } + + // load_complex + namespace detail + { + // Redefine these methods in the SSE-based archs if required + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) }; + } + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; + } + } + + // le + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmple_ps(self, other); + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmple_pd(self, other); + } + + // lt + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmplt_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmplt_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmplt_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmplt_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m128i tmp1 = _mm_sub_epi64(self, other); + __m128i tmp2 = _mm_xor_si128(self, other); + __m128i tmp3 = _mm_andnot_si128(other, self); + __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); + __m128i tmp5 = _mm_or_si128(tmp3, tmp4); + __m128i tmp6 = _mm_srai_epi32(tmp5, 31); + return _mm_shuffle_epi32(tmp6, 0xF5); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits::lowest())), _mm_xor_si128(other, 
_mm_set1_epi8(std::numeric_limits::lowest()))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits::lowest()))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits::lowest()))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits::lowest())); + auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits::lowest())); + __m128i tmp1 = _mm_sub_epi64(xself, xother); + __m128i tmp2 = _mm_xor_si128(xself, xother); + __m128i tmp3 = _mm_andnot_si128(xother, xself); + __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); + __m128i tmp5 = _mm_or_si128(tmp3, tmp4); + __m128i tmp6 = _mm_srai_epi32(tmp5, 31); + return _mm_shuffle_epi32(tmp6, 0xF5); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmplt_pd(self, other); + } + + /* compression table to turn 0b10 into 0b1, + * 0b100010 into 0b101 etc + */ + namespace detail + { + XSIMD_INLINE int mask_lut(uint64_t mask) + { + // clang-format off + static const int mask_lut[256] = { + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x4, 0x0, 0x5, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6, 0x0, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x8, 0x0, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0xA, 0x0, 0xB, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0xC, 0x0, 0xD, 0x0, 0x0, 0x0, 0x0, 0x0, 0xE, 0x0, 0xF, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + }; + // clang-format on + return mask_lut[mask & 0xAA]; + } + } + + // mask + template ::value, void>::type> + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_movemask_epi8(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint64_t mask8 = _mm_movemask_epi8(self); + return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_movemask_ps(_mm_castsi128_ps(self)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_movemask_pd(_mm_castsi128_pd(self)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE uint64_t mask(batch_bool 
const& self, requires_arch) noexcept + { + return _mm_movemask_ps(self); + } + + template + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_pd(self); + } + + // max + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self > other, self, other); + } + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_pd(self, other); + } + + // min + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self <= other, self, other); + } + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_pd(self, other); + } + + // mul + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mul_ps(self, other); + } + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mul_pd(self, other); + } + + // mul + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mullo_epi16(self, other); + } + + // nearbyint_as_int + template + XSIMD_INLINE batch nearbyint_as_int(batch const& self, + requires_arch) noexcept + { + return _mm_cvtps_epi32(self); + } + + // neg + template ::value, void>::type> + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return 0 - self; + } + template + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); + } + + template + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return _mm_xor_pd( + self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpneq_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data))); + } + + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpneq_pd(self, other); + } + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + + // reciprocal + template + XSIMD_INLINE batch reciprocal(batch const& self, + kernel::requires_arch) + { + return _mm_rcp_ps(self); + } + + // reduce_add + template + XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept + { + __m128 tmp0 = _mm_add_ps(self, 
_mm_movehl_ps(self, self)); + __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); + return _mm_cvtss_f32(tmp1); + } + + template ::value, void>::type> + XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); + __m128i tmp2 = _mm_add_epi32(self, tmp1); + __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01); + __m128i tmp4 = _mm_add_epi32(tmp2, tmp3); + return _mm_cvtsi128_si32(tmp4); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); + __m128i tmp2 = _mm_add_epi64(self, tmp1); +#if defined(__x86_64__) + return _mm_cvtsi128_si64(tmp2); +#else + __m128i m; + _mm_storel_epi64(&m, tmp2); + int64_t i; + std::memcpy(&i, &m, sizeof(i)); + return i; +#endif + } + else + { + return hadd(self, common {}); + } + } + + template + XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept + { + return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); + } + + // reduce_max + template ::type> + XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept + { + constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); + batch step0 = _mm_shuffle_epi32(self, mask0); + batch acc0 = max(self, step0); + + constexpr auto mask1 = detail::shuffle(1, 0, 0, 0); + batch step1 = _mm_shuffle_epi32(acc0, mask1); + batch acc1 = max(acc0, step1); + + constexpr auto mask2 = detail::shuffle(1, 0, 0, 0); + batch step2 = _mm_shufflelo_epi16(acc1, mask2); + batch acc2 = max(acc1, step2); + if (sizeof(T) == 2) + return first(acc2, A {}); + batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); + batch acc3 = max(acc2, step3); + return first(acc3, A {}); + } + + // reduce_min + template ::type> + XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept + { + constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); + batch step0 = _mm_shuffle_epi32(self, mask0); + batch acc0 = min(self, step0); + + constexpr auto mask1 = detail::shuffle(1, 0, 0, 0); + batch step1 = _mm_shuffle_epi32(acc0, mask1); + batch acc1 = min(acc0, step1); + + constexpr auto mask2 = detail::shuffle(1, 0, 0, 0); + batch step2 = _mm_shufflelo_epi16(acc1, mask2); + batch acc2 = min(acc1, step2); + if (sizeof(T) == 2) + return first(acc2, A {}); + batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); + batch acc3 = min(acc2, step3); + return first(acc3, A {}); + } + + // rsqrt + template + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm_rsqrt_ps(val); + } + template + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val))); + } + + // select + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); + } + + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); + } + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... 
}, true_br, false_br, sse2 {}); + } + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); + } + + // shuffle + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + { + constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); + // shuffle within lane + if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4) + return _mm_shuffle_ps(x, y, smask); + + // shuffle within opposite lane + if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4) + return _mm_shuffle_ps(y, x, smask); + return shuffle(x, y, mask, common {}); + } + + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + { + constexpr uint32_t smask = detail::mod_shuffle(I0, I1); + // shuffle within lane + if (I0 < 2 && I1 >= 2) + return _mm_shuffle_pd(x, y, smask); + + // shuffle within opposite lane + if (I0 >= 2 && I1 < 2) + return _mm_shuffle_pd(y, x, smask); + return shuffle(x, y, mask, common {}); + } + + // sqrt + template + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm_sqrt_ps(val); + } + template + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm_sqrt_pd(val); + } + + // slide_left + template + XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept + { + return _mm_slli_si128(x, N); + } + + // slide_right + template + XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept + { + return _mm_srli_si128(x, N); + } + + // sadd + + template ::value, void>::type> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_adds_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_adds_epi16(self, other); + } + else + { + return sadd(self, other, common {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_adds_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_adds_epu16(self, other); + } + else + { + return sadd(self, other, common {}); + } + } + } + + // set + template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm_setr_ps(values...); + } + + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1) noexcept + { + return _mm_set_epi64x(v1, v0); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept + { + return _mm_setr_epi32(v0, v1, v2, v3); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... 
values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm_setr_pd(values...); + } + + template ::value, void>::type> + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } + + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm_castsi128_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } + + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm_castsi128_pd(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } + + // ssub + + template ::value, void>::type> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_subs_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_subs_epi16(self, other); + } + else + { + return ssub(self, other, common {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_subs_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_subs_epu16(self, other); + } + else + { + return ssub(self, other, common {}); + } + } + } + + // store_aligned + template + XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm_store_ps(mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm_store_si128((__m128i*)mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm_store_si128((__m128i*)mem, self); + } + template + XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm_store_pd(mem, self); + } + + // store_unaligned + template + XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_ps(mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_si128((__m128i*)mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm_storeu_si128((__m128i*)mem, self); + } + template + XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_pd(mem, self); + } + + // sub + template + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_sub_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_sub_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_sub_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_sub_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_sub_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; 
+ } + } + template + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_sub_pd(self, other); + } + + // swizzle + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); + return _mm_shuffle_ps(self, self, index); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(V0, V1); + return _mm_shuffle_pd(self, self, index); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); + return _mm_shuffle_epi32(self, index); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); + return _mm_shuffle_epi32(self, index); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + // permute within each lane + constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3); + constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7); + __m128i lo = _mm_shufflelo_epi16(self, mask_lo); + __m128i hi = _mm_shufflehi_epi16(self, mask_hi); + + __m128i lo_lo = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lo), _mm_castsi128_pd(lo), _MM_SHUFFLE2(0, 0))); + __m128i hi_hi = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(hi), _mm_castsi128_pd(hi), _MM_SHUFFLE2(1, 1))); + + // mask to choose the right lane + batch_bool_constant blend_mask; + + // blend the two permutes + return select(blend_mask, batch(lo_lo), batch(hi_hi)); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + } + + // transpose + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3]; + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + matrix_begin[0] = r0; + matrix_begin[1] = r1; + matrix_begin[2] = r2; + matrix_begin[3] = r3; + } + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + auto r0 = matrix_begin[0], r1 = matrix_begin[1]; + matrix_begin[0] = _mm_unpacklo_pd(r0, r1); + matrix_begin[1] = _mm_unpackhi_pd(r0, r1); + } + 
template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + + // zip_hi + template + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpackhi_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_unpackhi_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_unpackhi_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_unpackhi_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_unpackhi_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpackhi_pd(self, other); + } + + // zip_lo + template + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpacklo_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_unpacklo_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_unpacklo_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_unpacklo_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_unpacklo_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpacklo_pd(self, other); + } +#endif + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index f88d94f93..0428c8f7c 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -128,6 +128,10 @@ #include "./xsimd_wasm.hpp" #endif +#if XSIMD_WITH_ALTIVEC +#include "./xsimd_altivec.hpp" +#endif + // Must come last to have access to all conversion specializations. #include "./xsimd_common.hpp" diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index 326f766c4..98d603647 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ -408,6 +408,17 @@ #define XSIMD_WITH_WASM 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if Altivec is available at compile-time, to 0 otherwise. 
+ */ +#ifdef __VEC__ +#define XSIMD_WITH_ALTIVEC 1 +#else +#define XSIMD_WITH_ALTIVEC 0 +#endif + // Workaround for MSVC compiler #ifdef _MSC_VER @@ -466,7 +477,7 @@ #endif -#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_EMULATED +#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_ALTIVEC && !XSIMD_WITH_EMULATED #define XSIMD_NO_SUPPORTED_ARCHITECTURE #endif diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp index a652061a8..9d72454dd 100644 --- a/include/xsimd/types/xsimd_all_registers.hpp +++ b/include/xsimd/types/xsimd_all_registers.hpp @@ -48,6 +48,8 @@ #include "xsimd_wasm_register.hpp" +#include "xsimd_altivec_register.hpp" + #if XSIMD_WITH_EMULATED #include "xsimd_emulated_register.hpp" #endif diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp new file mode 100644 index 000000000..0ec59ac17 --- /dev/null +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -0,0 +1,57 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_ALTIVEC_REGISTER_HPP +#define XSIMD_ALTIVEC_REGISTER_HPP + +#include "./xsimd_common_arch.hpp" +#include "./xsimd_register.hpp" + +#if XSIMD_WITH_ALTIVEC +#include +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * Altivec instructions + */ + struct altivec : common + { + static constexpr bool supported() noexcept { return XSIMD_WITH_ALTIVEC; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr char const* name() noexcept { return "altivec"; } + }; + +#if XSIMD_WITH_ALTIVEC + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER(signed char, altivec, vector signed char); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, altivec, vector unsigned char); + XSIMD_DECLARE_SIMD_REGISTER(char, altivec, vecroe char); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, altivec, vector unsigned short); + XSIMD_DECLARE_SIMD_REGISTER(short, altivec, vector short); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, altivec, vector unsigned int); + XSIMD_DECLARE_SIMD_REGISTER(int, altivec, vector int); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, altivec, vector unsigned long); + XSIMD_DECLARE_SIMD_REGISTER(long int, altivec, vector long); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, altivec, vector unsigned long long); + XSIMD_DECLARE_SIMD_REGISTER(long long int, altivec, vector long long); + XSIMD_DECLARE_SIMD_REGISTER(float, altivec, vector float); + } +#endif +} + +#endif From a8da6516c4b1f1d24cfde056123d1385126fd673 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 13:28:47 +0200 Subject: [PATCH 02/57] minimal test --- test/CMakeLists.txt | 72 ++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8a4ce50d5..bf26edcb4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -127,42 +127,42 @@ endif() set(XSIMD_TESTS main.cpp - test_api.cpp - test_arch.cpp - test_basic_math.cpp - test_batch.cpp - test_batch_bool.cpp - test_batch_cast.cpp - test_batch_complex.cpp - test_batch_float.cpp - test_batch_int.cpp - test_bitwise_cast.cpp - test_batch_constant.cpp - test_batch_manip.cpp - test_complex_exponential.cpp - test_complex_hyperbolic.cpp - test_complex_power.cpp - test_complex_trigonometric.cpp - test_conversion.cpp - test_custom_default_arch.cpp - test_error_gamma.cpp - test_explicit_batch_instantiation.cpp - test_exponential.cpp - test_extract_pair.cpp - test_fp_manipulation.cpp - test_hyperbolic.cpp - test_load_store.cpp - test_memory.cpp - test_poly_evaluation.cpp - test_power.cpp - test_rounding.cpp - test_select.cpp - test_shuffle.cpp - test_sum.cpp - test_traits.cpp - test_trigonometric.cpp - test_xsimd_api.cpp - test_utils.hpp + # test_api.cpp + # test_arch.cpp + # test_basic_math.cpp + # test_batch.cpp + # test_batch_bool.cpp + # test_batch_cast.cpp + # test_batch_complex.cpp + # test_batch_float.cpp + # test_batch_int.cpp + # test_bitwise_cast.cpp + # test_batch_constant.cpp + # test_batch_manip.cpp + # test_complex_exponential.cpp + # test_complex_hyperbolic.cpp + # test_complex_power.cpp + # test_complex_trigonometric.cpp + # test_conversion.cpp + # test_custom_default_arch.cpp + # test_error_gamma.cpp + # test_explicit_batch_instantiation.cpp + # test_exponential.cpp + # 
test_extract_pair.cpp + # test_fp_manipulation.cpp + # test_hyperbolic.cpp + # test_load_store.cpp + # test_memory.cpp + # test_poly_evaluation.cpp + # test_power.cpp + # test_rounding.cpp + # test_select.cpp + # test_shuffle.cpp + # test_sum.cpp + # test_traits.cpp + # test_trigonometric.cpp + # test_xsimd_api.cpp + # test_utils.hpp ) if(NOT MSVC) From 16838236ee83f158ce0f70ff6365e83c3b1600e4 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 13:41:01 +0200 Subject: [PATCH 03/57] add ppc toolchain description --- .../toolchains/gcc-powerpc-linux-gnu.cmake | 5 +++ .../workflows/{cross.yml => cross-arm.yml} | 1 - .github/workflows/cross-ppc.yml | 43 +++++++++++++++++++ .../xsimd/types/xsimd_altivec_register.hpp | 20 ++++----- test/CMakeLists.txt | 4 +- 5 files changed, 59 insertions(+), 14 deletions(-) create mode 100644 .github/toolchains/gcc-powerpc-linux-gnu.cmake rename .github/workflows/{cross.yml => cross-arm.yml} (96%) create mode 100644 .github/workflows/cross-ppc.yml diff --git a/.github/toolchains/gcc-powerpc-linux-gnu.cmake b/.github/toolchains/gcc-powerpc-linux-gnu.cmake new file mode 100644 index 000000000..a318f6412 --- /dev/null +++ b/.github/toolchains/gcc-powerpc-linux-gnu.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_PROCESSOR powerpc) +set(triple powerpc-linux-gnu) + +include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) + diff --git a/.github/workflows/cross.yml b/.github/workflows/cross-arm.yml similarity index 96% rename from .github/workflows/cross.yml rename to .github/workflows/cross-arm.yml index e71096f02..071e85f25 100644 --- a/.github/workflows/cross.yml +++ b/.github/workflows/cross-arm.yml @@ -13,7 +13,6 @@ jobs: - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=neon', full: 'ON'} - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=vfpv3-d16', full: 'OFF' } # no neon - { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' } - - { platform: 'ppc', arch: 'powerpc', dir: 'powerpc-linux-gnu', flags: '-maltivec', full: 'OFF' } sys: - { compiler: 'gcc', version: '10' } steps: diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml new file mode 100644 index 000000000..f63383a8f --- /dev/null +++ b/.github/workflows/cross-ppc.yml @@ -0,0 +1,43 @@ +name: PowerPC cross-compilation build +on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + name: '${{ matrix.target.arch }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' + strategy: + matrix: + target: + - { platform: 'ppc', dir: 'powerpc-linux-gnu', flags: '-maltivec', full: 'OFF' } + sys: + - { compiler: 'gcc', version: '10' } + steps: + - name: Setup compiler + if: ${{ matrix.sys.compiler == 'gcc' }} + run: | + sudo apt-get update || exit 1 + sudo apt-get --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib || exit 1 + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir 
}}-g++-${{ matrix.sys.version }} 20 + - name: Setup QEMU + run: | + sudo apt-get --no-install-suggests --no-install-recommends install qemu-user + - name: Setup Ninja + run: | + sudo apt-get install ninja-build + - name: Checkout xsimd + uses: actions/checkout@v3 + - name: Setup + run: | + mkdir _build + cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake + - name: Build + run: cmake --build _build --verbose + - name: Testing xsimd + run: | + qemu-${{ matrix.target.platform }} -cpu 7400 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd + working-directory: ${{ github.workspace }}/_build diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp index 0ec59ac17..52f896bf2 100644 --- a/include/xsimd/types/xsimd_altivec_register.hpp +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -38,18 +38,14 @@ namespace xsimd #if XSIMD_WITH_ALTIVEC namespace types { - XSIMD_DECLARE_SIMD_REGISTER(signed char, altivec, vector signed char); - XSIMD_DECLARE_SIMD_REGISTER(unsigned char, altivec, vector unsigned char); - XSIMD_DECLARE_SIMD_REGISTER(char, altivec, vecroe char); - XSIMD_DECLARE_SIMD_REGISTER(unsigned short, altivec, vector unsigned short); - XSIMD_DECLARE_SIMD_REGISTER(short, altivec, vector short); - XSIMD_DECLARE_SIMD_REGISTER(unsigned int, altivec, vector unsigned int); - XSIMD_DECLARE_SIMD_REGISTER(int, altivec, vector int); - XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, altivec, vector unsigned long); - XSIMD_DECLARE_SIMD_REGISTER(long int, altivec, vector long); - XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, altivec, vector unsigned long long); - XSIMD_DECLARE_SIMD_REGISTER(long long int, altivec, vector long long); - XSIMD_DECLARE_SIMD_REGISTER(float, altivec, vector float); + XSIMD_DECLARE_SIMD_REGISTER(signed char, altivec, __vector signed char); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, altivec, __vector unsigned char); + XSIMD_DECLARE_SIMD_REGISTER(char, altivec, __vector char); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, altivec, __vector unsigned short); + XSIMD_DECLARE_SIMD_REGISTER(short, altivec, __vector short); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, altivec, __vector unsigned int); + XSIMD_DECLARE_SIMD_REGISTER(int, altivec, __vector int); + XSIMD_DECLARE_SIMD_REGISTER(float, altivec, __vector float); } #endif } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bf26edcb4..bdcab4b80 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -107,6 +107,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=${TARGET_ARCH} -mtune=${TARGET_ARCH}") elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") # Nothing specific + elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc") + # Nothing specific elseif(NOT WIN32 AND NOT EMSCRIPTEN) if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}") @@ -166,7 +168,7 @@ set(XSIMD_TESTS ) if(NOT MSVC) - list(APPEND XSIMD_TESTS test_gnu_source.cpp) + #list(APPEND XSIMD_TESTS test_gnu_source.cpp) endif() add_executable(test_xsimd 
${XSIMD_TESTS} ${XSIMD_HEADERS}) From ddfad22678996fd60c5d2cdc43720951a8415cb4 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 15:26:11 +0200 Subject: [PATCH 04/57] + test_arch --- test/CMakeLists.txt | 74 ++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bdcab4b80..e6bad7999 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -129,46 +129,46 @@ endif() set(XSIMD_TESTS main.cpp - # test_api.cpp - # test_arch.cpp - # test_basic_math.cpp - # test_batch.cpp - # test_batch_bool.cpp - # test_batch_cast.cpp - # test_batch_complex.cpp - # test_batch_float.cpp - # test_batch_int.cpp - # test_bitwise_cast.cpp - # test_batch_constant.cpp - # test_batch_manip.cpp - # test_complex_exponential.cpp - # test_complex_hyperbolic.cpp - # test_complex_power.cpp - # test_complex_trigonometric.cpp - # test_conversion.cpp - # test_custom_default_arch.cpp - # test_error_gamma.cpp - # test_explicit_batch_instantiation.cpp - # test_exponential.cpp - # test_extract_pair.cpp - # test_fp_manipulation.cpp - # test_hyperbolic.cpp - # test_load_store.cpp - # test_memory.cpp - # test_poly_evaluation.cpp - # test_power.cpp - # test_rounding.cpp - # test_select.cpp - # test_shuffle.cpp - # test_sum.cpp - # test_traits.cpp - # test_trigonometric.cpp - # test_xsimd_api.cpp - # test_utils.hpp + test_api.cpp + test_arch.cpp + test_basic_math.cpp + test_batch.cpp + test_batch_bool.cpp + test_batch_cast.cpp + test_batch_complex.cpp + test_batch_float.cpp + test_batch_int.cpp + test_bitwise_cast.cpp + test_batch_constant.cpp + test_batch_manip.cpp + test_complex_exponential.cpp + test_complex_hyperbolic.cpp + test_complex_power.cpp + test_complex_trigonometric.cpp + test_conversion.cpp + test_custom_default_arch.cpp + test_error_gamma.cpp + test_explicit_batch_instantiation.cpp + test_exponential.cpp + test_extract_pair.cpp + test_fp_manipulation.cpp + test_hyperbolic.cpp + test_load_store.cpp + test_memory.cpp + test_poly_evaluation.cpp + test_power.cpp + test_rounding.cpp + test_select.cpp + test_shuffle.cpp + test_sum.cpp + test_traits.cpp + test_trigonometric.cpp + test_xsimd_api.cpp + test_utils.hpp ) if(NOT MSVC) - #list(APPEND XSIMD_TESTS test_gnu_source.cpp) + list(APPEND XSIMD_TESTS test_gnu_source.cpp) endif() add_executable(test_xsimd ${XSIMD_TESTS} ${XSIMD_HEADERS}) From c586776142ae035a68ae47ca882f608b206b062b Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 15:54:47 +0200 Subject: [PATCH 05/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 211 ++++++--------------------- 1 file changed, 41 insertions(+), 170 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 23206ad5d..bbe87192d 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -65,59 +65,23 @@ namespace xsimd XSIMD_INLINE batch avg(batch const&, batch const&, requires_arch) noexcept; template XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; +#endif // abs template - XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept - { - __m128d sign_mask = _mm_set1_pd(-0.f); // -0.f = 1 << 31 - return _mm_andnot_pd(sign_mask, self); - } - template - XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { - __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 - return _mm_andnot_ps(sign_mask, 
self); + return vec_abs(self); } // add - template ::value, void>::type> - XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_add_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_add_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_add_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_add_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } + return vec_add(self, other); } - template - XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_add_ps(self, other); - } - - template - XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_add_pd(self, other); - } +#if 0 // all template @@ -923,40 +887,22 @@ namespace xsimd return _mm_cmpunord_pd(self, self); } +#endif // load_aligned - template - XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept - { - return _mm_load_ps(mem); - } - template ::value, void>::type> - XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { - return _mm_load_si128((__m128i const*)mem); - } - template - XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept - { - return _mm_load_pd(mem); + return vec_ld(0, mem); } // load_unaligned - template - XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { - return _mm_loadu_ps(mem); - } - template ::value, void>::type> - XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept - { - return _mm_loadu_si128((__m128i const*)mem); - } - template - XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept - { - return _mm_loadu_pd(mem); + return *(typename batch::register_type)mem; } +#if 0 // load_complex namespace detail { @@ -972,6 +918,8 @@ namespace xsimd return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; } } +#endif +#if 0 // le template @@ -1435,42 +1383,14 @@ namespace xsimd return _mm_srli_si128(x, N); } +#endif // sadd - - template ::value, void>::type> - XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { - if (std::is_signed::value) - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_adds_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_adds_epi16(self, other); - } - else - { - return sadd(self, other, common {}); - } - } - else - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_adds_epu8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_adds_epu16(self, other); - } - else - { - return sadd(self, other, common {}); - } - } + return vec_adds(self, other); } +#if 0 // set template @@ -1527,88 +1447,39 @@ namespace xsimd static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return 
_mm_castsi128_pd(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } +#endif // ssub - template ::value, void>::type> - XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { - if (std::is_signed::value) - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_subs_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_subs_epi16(self, other); - } - else - { - return ssub(self, other, common {}); - } - } - else - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_subs_epu8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_subs_epu16(self, other); - } - else - { - return ssub(self, other, common {}); - } - } + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return vec_subs(self, other); + } + else + { + return ssub(self, other, common {}); + } } + // store_aligned - template - XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept - { - return _mm_store_ps(mem, self); - } - template ::value, void>::type> - XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { - return _mm_store_si128((__m128i*)mem, self); - } - template ::value, void>::type> - XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept - { - return _mm_store_si128((__m128i*)mem, self); - } - template - XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept - { - return _mm_store_pd(mem, self); + return vec_st(self, 0, mem); } // store_unaligned - template - XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept - { - return _mm_storeu_ps(mem, self); - } - template ::value, void>::type> - XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { - return _mm_storeu_si128((__m128i*)mem, self); - } - template ::value, void>::type> - XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept - { - return _mm_storeu_si128((__m128i*)mem, self); - } - template - XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept - { - return _mm_storeu_pd(mem, self); + *(typename batch::register_type)mem = self; } +#if 0 // sub template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept From 9c0bd33a365fd75366f089f37bba1390673c9015 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 16:16:51 +0200 Subject: [PATCH 06/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 79 +++++++--------------------- include/xsimd/config/xsimd_arch.hpp | 4 +- include/xsimd/config/xsimd_cpuid.hpp | 5 ++ 3 files changed, 27 insertions(+), 61 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index bbe87192d..059eaa967 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -889,7 +889,7 @@ namespace xsimd #endif // load_aligned - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { return vec_ld(0, mem); @@ -1115,25 +1115,15 @@ 
namespace xsimd { return _mm_min_pd(self, other); } +#endif // mul - template - XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_mul_ps(self, other); - } - template - XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_mul_pd(self, other); - } - - // mul - template - XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_mullo_epi16(self, other); + return vec_mul(self, other); } +#if 0 // nearbyint_as_int template @@ -1388,7 +1378,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { - return vec_adds(self, other); + return vec_adds(self, other); } #if 0 @@ -1454,17 +1444,16 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return vec_subs(self, other); - } - else - { - return ssub(self, other, common {}); - } + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return vec_subs(self, other); + } + else + { + return ssub(self, other, common {}); + } } - // store_aligned template ::value, void>::type> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept @@ -1479,44 +1468,14 @@ namespace xsimd *(typename batch::register_type)mem = self; } -#if 0 // sub - template - XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_sub_ps(self, other); - } - template ::value, void>::type> - XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_sub_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_sub_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_sub_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_sub_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - template - XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_sub_pd(self, other); + return vec_sub(self, other); } +#if 0 // swizzle template diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp index 89fc6783d..99a133ef0 100644 --- a/include/xsimd/config/xsimd_arch.hpp +++ b/include/xsimd/config/xsimd_arch.hpp @@ -169,14 +169,16 @@ namespace xsimd using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; using all_rvv_architectures = arch_list, detail::rvv<256>, detail::rvv<128>>; using all_arm_architectures = typename detail::join, neon64, neon>>::type; + using all_power_architectures = arch_list; using all_riscv_architectures = all_rvv_architectures; using all_wasm_architectures = arch_list; - using all_architectures = typename detail::join::type; + using all_architectures = typename detail::join::type; using supported_architectures = typename detail::supported::type; using x86_arch = typename detail::supported::type::best; using arm_arch = typename detail::supported::type::best; + using power_arch = typename 
detail::supported::type::best; using riscv_arch = typename detail::supported::type::best; using best_arch = typename supported_architectures::best; diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index 7b940f655..fdd044f3d 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -84,6 +84,7 @@ namespace xsimd ARCH_FIELD_EX_REUSE(detail::rvv<256>, rvv) ARCH_FIELD_EX_REUSE(detail::rvv<128>, rvv) ARCH_FIELD(wasm) + ARCH_FIELD(altivec) #undef ARCH_FIELD @@ -95,6 +96,10 @@ namespace xsimd wasm = 1; #endif +#if XSIMD_WITH_ALTIVEC + altivec = 1; +#endif + #if defined(__aarch64__) || defined(_M_ARM64) neon = 1; neon64 = 1; From ad8177bd4dbfe4995009840498184b2f517f73ec Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 16:47:39 +0200 Subject: [PATCH 07/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 400 +++++++++++---------------- 1 file changed, 165 insertions(+), 235 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 059eaa967..997fb5944 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -85,41 +85,41 @@ namespace xsimd // all template - XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_ps(self) == 0x0F; } template - XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_pd(self) == 0x03; } template ::value, void>::type> - XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_epi8(self) == 0xFFFF; } // any template - XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_ps(self) != 0; } template - XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_pd(self) != 0; } template ::value, void>::type> - XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_epi8(self) != 0; } // avgr template ::value, void>::type> - XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -137,7 +137,7 @@ namespace xsimd // avg template ::value, void>::type> - XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -154,86 +154,43 @@ namespace xsimd return avg(self, other, common {}); } } +#endif // batch_bool_cast template - XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return { bitwise_cast(batch(self.data)).data }; } // bitwise_and - template - XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_and_ps(self, other); - } - 
template - XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_and_ps(self, other); - } - template ::value, void>::type> - XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_and_si128(self, other); - } - template ::value, void>::type> - XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_and_si128(self, other); - } - - template - batch XSIMD_INLINE bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_and_pd(self, other); + return vec_and(self, other); } - - template - XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return _mm_and_pd(self, other); + return vec_and(self, other); } // bitwise_andnot - template - XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_andnot_ps(other, self); - } - - template - XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_andnot_ps(other, self); - } template ::value, void>::type> - XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_andnot_si128(other, self); + return vec_andc(other, self); } template ::value, void>::type> - XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_andnot_si128(other, self); - } - - template - XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return _mm_andnot_pd(other, self); - } - - template - XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_andnot_pd(other, self); + return vec_andc(other, self); } +#if 0 // bitwise_lshift template ::value, void>::type> - XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -260,73 +217,73 @@ namespace xsimd // bitwise_not template - XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); } template - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); } template ::value, void>::type> - XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm_xor_si128(self, _mm_set1_epi32(-1)); } template ::value, 
void>::type> - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm_xor_si128(self, _mm_set1_epi32(-1)); } template - XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); } template - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); } // bitwise_or template - XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm_or_ps(self, other); } template - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_or_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm_or_si128(self, other); } template ::value, void>::type> - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_or_si128(self, other); } template - XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm_or_pd(self, other); } template - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_or_pd(self, other); } // bitwise_rshift template ::value, void>::type> - XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { @@ -388,81 +345,54 @@ namespace xsimd // bitwise_xor template - XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm_xor_ps(self, other); } template - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm_xor_si128(self, other); } template - XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) 
noexcept { return _mm_xor_pd(self, other); } template - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_pd(self, other); } template ::value, void>::type> - XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_si128(self, other); } +#endif // bitwise_cast - template ::value, void>::type> - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castsi128_ps(self); - } - template ::type>::value, void>::type> - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return batch(self.data); - } - template ::value, void>::type> - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { - return _mm_castps_si128(self); - } - template ::value, void>::type> - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castsi128_pd(self); - } - template - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castps_pd(self); - } - template - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castpd_ps(self); - } - template ::value, void>::type> - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castpd_si128(self); + return *reinterpret_cast::register_type const*>(&self.data); } +#if 0 + // broadcast template - batch XSIMD_INLINE broadcast(float val, requires_arch) noexcept + batch XSIMD_INLINE broadcast(float val, requires_arch) noexcept { return _mm_set1_ps(val); } template ::value, void>::type> - XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -487,7 +417,7 @@ namespace xsimd } } template - XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept + XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept { return _mm_set1_pd(val); } @@ -498,23 +428,23 @@ namespace xsimd // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned // complex_low template - XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return _mm_unpacklo_ps(self.real(), self.imag()); } // complex_high template - XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return _mm_unpackhi_ps(self.real(), self.imag()); } template - XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return _mm_unpacklo_pd(self.real(), self.imag()); } template - XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return _mm_unpackhi_pd(self.real(), self.imag()); } @@ 
-522,19 +452,19 @@ namespace xsimd // decr_if template ::value, void>::type> - XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self + batch(mask.data); } // div template - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm_div_ps(self, other); } template - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm_div_pd(self, other); } @@ -543,16 +473,16 @@ namespace xsimd namespace detail { template - XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_cvtepi32_ps(self); } template - XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx - // adapted to sse2 + // adapted to altivec __m128i xH = _mm_srli_epi64(x, 32); xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84 __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); @@ -562,10 +492,10 @@ namespace xsimd } template - XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx - // adapted to sse2 + // adapted to altivec __m128i xH = _mm_srai_epi32(x, 16); xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67 @@ -576,7 +506,7 @@ namespace xsimd } template - XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_cvttps_epi32(self); } @@ -584,17 +514,17 @@ namespace xsimd // eq template - XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpeq_ps(self, other); } template - XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other))); } template ::value, void>::type> - XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -623,36 +553,36 @@ namespace xsimd } } template ::value, void>::type> - XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& 
other, requires_arch) noexcept { return ~(self != other); } template - XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpeq_pd(self, other); } template - XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); } // first template - XSIMD_INLINE float first(batch const& self, requires_arch) noexcept + XSIMD_INLINE float first(batch const& self, requires_arch) noexcept { return _mm_cvtss_f32(self); } template - XSIMD_INLINE double first(batch const& self, requires_arch) noexcept + XSIMD_INLINE double first(batch const& self, requires_arch) noexcept { return _mm_cvtsd_f64(self); } template ::value, void>::type> - XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -687,7 +617,7 @@ namespace xsimd // from_mask template - XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint32_t lut[][4] = { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, @@ -711,7 +641,7 @@ namespace xsimd return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask])); } template - XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut[][4] = { { 0x0000000000000000ul, 0x0000000000000000ul }, @@ -723,7 +653,7 @@ namespace xsimd return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask])); } template ::value, void>::type> - XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut64[] = { 0x0000000000000000, @@ -773,34 +703,34 @@ namespace xsimd } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - return _mm_castps_si128(from_mask(batch_bool {}, mask, sse2 {})); + return _mm_castps_si128(from_mask(batch_bool {}, mask, altivec {})); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { - return _mm_castpd_si128(from_mask(batch_bool {}, mask, sse2 {})); + return _mm_castpd_si128(from_mask(batch_bool {}, mask, altivec {})); } } // ge template - XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpge_ps(self, other); } template - XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpge_pd(self, other); } // gt template - XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpgt_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch_bool gt(batch const& self, batch const& 
other, requires_arch) noexcept + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { @@ -828,14 +758,14 @@ namespace xsimd } template - XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpgt_pd(self, other); } // haddp template - XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); @@ -848,7 +778,7 @@ namespace xsimd return _mm_add_ps(tmp0, tmp2); } template - XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), _mm_unpackhi_pd(row[0], row[1])); @@ -856,14 +786,14 @@ namespace xsimd // incr_if template ::value, void>::type> - XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self - batch(mask.data); } // insert template ::value, void>::type> - XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept + XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { @@ -877,12 +807,12 @@ namespace xsimd // isnan template - XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm_cmpunord_ps(self, self); } template - XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm_cmpunord_pd(self, self); } @@ -908,12 +838,12 @@ namespace xsimd { // Redefine these methods in the SSE-based archs if required template - XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) }; } template - XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; } @@ -923,24 +853,24 @@ namespace xsimd // le template - XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmple_ps(self, other); } template - XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmple_pd(self, other); } // lt template - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmplt_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, 
requires_arch) noexcept + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { @@ -1007,7 +937,7 @@ namespace xsimd } template - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmplt_pd(self, other); } @@ -1045,7 +975,7 @@ namespace xsimd // mask template ::value, void>::type> - XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -1071,47 +1001,47 @@ namespace xsimd } } template - XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_ps(self); } template - XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_pd(self); } // max template - XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm_max_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return select(self > other, self, other); } template - XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm_max_pd(self, other); } // min template - XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm_min_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return select(self <= other, self, other); } template - XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm_min_pd(self, other); } @@ -1128,25 +1058,25 @@ namespace xsimd // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& self, - requires_arch) noexcept + requires_arch) noexcept { return _mm_cvtps_epi32(self); } // neg template ::value, void>::type> - XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return 0 - self; } template - XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } template - XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return _mm_xor_pd( self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); @@ -1154,33 +1084,33 @@ namespace xsimd // neq template - XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE 
batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpneq_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template - XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data))); } template - XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpneq_pd(self, other); } template - XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_pd(self, other); } @@ -1188,14 +1118,14 @@ namespace xsimd // reciprocal template XSIMD_INLINE batch reciprocal(batch const& self, - kernel::requires_arch) + kernel::requires_arch) { return _mm_rcp_ps(self); } // reduce_add template - XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept + XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept { __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self)); __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); @@ -1203,7 +1133,7 @@ namespace xsimd } template ::value, void>::type> - XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept + XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { @@ -1234,14 +1164,14 @@ namespace xsimd } template - XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept + XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept { return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); } // reduce_max template ::type> - XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept + XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept { constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); batch step0 = _mm_shuffle_epi32(self, mask0); @@ -1263,7 +1193,7 @@ namespace xsimd // reduce_min template ::type> - XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept + XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept { constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); batch step0 = _mm_shuffle_epi32(self, mask0); @@ -1285,42 +1215,42 @@ namespace xsimd // rsqrt template - XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm_rsqrt_ps(val); } template - XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val))); } // select template - XSIMD_INLINE batch select(batch_bool const& cond, batch const& 
true_br, batch const& false_br, requires_arch) noexcept + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); } template ::value, void>::type> - XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); } template ::value, void>::type> - XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return select(batch_bool { Values... }, true_br, false_br, sse2 {}); + return select(batch_bool { Values... }, true_br, false_br, altivec {}); } template - XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); } // shuffle template - XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); // shuffle within lane @@ -1334,7 +1264,7 @@ namespace xsimd } template - XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1); // shuffle within lane @@ -1349,26 +1279,26 @@ namespace xsimd // sqrt template - XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm_sqrt_ps(val); } template - XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm_sqrt_pd(val); } // slide_left template - XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept + XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { return _mm_slli_si128(x, N); } // slide_right template - XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept + XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { return _mm_srli_si128(x, N); } @@ -1384,55 +1314,55 @@ namespace xsimd // set template - XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, Values... 
values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm_setr_ps(values...); } template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1) noexcept { return _mm_set_epi64x(v1, v0); } template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept { return _mm_setr_epi32(v0, v1, v2, v3); } template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template - XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm_setr_pd(values...); } template ::value, void>::type> - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } template - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm_castsi128_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } template - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm_castsi128_pd(set(batch(), A {}, static_cast(values ? 
-1LL : 0LL)...).data); @@ -1479,47 +1409,47 @@ namespace xsimd // swizzle template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); return _mm_shuffle_ps(self, self, index); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1); return _mm_shuffle_pd(self, self, index); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); return _mm_shuffle_epi32(self, index); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { - return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + return bitwise_cast(swizzle(bitwise_cast(self), mask, altivec {})); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); return _mm_shuffle_epi32(self, index); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { - return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + return bitwise_cast(swizzle(bitwise_cast(self), mask, altivec {})); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { // permute within each lane constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3); @@ -1538,14 +1468,14 @@ namespace xsimd } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { - return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + return bitwise_cast(swizzle(bitwise_cast(self), mask, altivec {})); } // transpose template - XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; @@ -1557,18 +1487,18 @@ namespace xsimd matrix_begin[3] = r3; } template - XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template - XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template - XSIMD_INLINE void 
transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; @@ -1577,24 +1507,24 @@ namespace xsimd matrix_begin[1] = _mm_unpackhi_pd(r0, r1); } template - XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template - XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } // zip_hi template - XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpackhi_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -1619,19 +1549,19 @@ namespace xsimd } } template - XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpackhi_pd(self, other); } // zip_lo template - XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpacklo_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -1656,7 +1586,7 @@ namespace xsimd } } template - XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpacklo_pd(self, other); } From f2246a447d50051003a1452354558b6567798143 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 17:51:37 +0200 Subject: [PATCH 08/57] WIP --- .github/workflows/cross-ppc.yml | 2 +- include/xsimd/arch/xsimd_altivec.hpp | 46 +++------------------------- 2 files changed, 5 insertions(+), 43 deletions(-) diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml index f63383a8f..ec7a11f04 100644 --- a/.github/workflows/cross-ppc.yml +++ b/.github/workflows/cross-ppc.yml @@ -36,7 +36,7 @@ jobs: mkdir _build cd _build && cmake .. 
-DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake - name: Build - run: cmake --build _build --verbose + run: cmake --build _build --verbose -j1 - name: Testing xsimd run: | qemu-${{ matrix.target.platform }} -cpu 7400 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 997fb5944..ce6c2a30f 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -567,53 +567,15 @@ namespace xsimd { return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); } +#endif // first - template - XSIMD_INLINE float first(batch const& self, requires_arch) noexcept - { - return _mm_cvtss_f32(self); - } - - template - XSIMD_INLINE double first(batch const& self, requires_arch) noexcept - { - return _mm_cvtsd_f64(self); - } - - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return static_cast(_mm_cvtsi128_si32(self) & 0xFF); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return static_cast(_mm_cvtsi128_si32(self) & 0xFFFF); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return static_cast(_mm_cvtsi128_si32(self)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { -#if defined(__x86_64__) - return static_cast(_mm_cvtsi128_si64(self)); -#else - __m128i m; - _mm_storel_epi64(&m, self); - int64_t i; - std::memcpy(&i, &m, sizeof(i)); - return i; -#endif - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } + return vec_extract(self, 0); } +#if 0 // from_mask template From f90e5e81d610e5c897d1872e21637fb447af80ab Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 17:55:33 +0200 Subject: [PATCH 09/57] WIP --- test/test_utils.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test/test_utils.hpp b/test/test_utils.hpp index f295a8be6..dff6ede09 100644 --- a/test/test_utils.hpp +++ b/test/test_utils.hpp @@ -571,6 +571,14 @@ namespace xsimd * Testing types lists * ***********************/ +#ifdef XSIMD_WITH_ALTIVEC +#define BATCH_INT_TYPES xsimd::batch, \ + xsimd::batch, \ + xsimd::batch, \ + xsimd::batch, \ + xsimd::batch, \ + xsimd::batch +#else #define BATCH_INT_TYPES xsimd::batch, \ xsimd::batch, \ xsimd::batch, \ @@ -579,13 +587,14 @@ namespace xsimd xsimd::batch, \ xsimd::batch, \ xsimd::batch +#endif -#if XSIMD_WITH_NEON64 || !XSIMD_WITH_NEON +#if XSIMD_WITH_NEON64 || (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) #define BATCH_FLOAT_TYPES xsimd::batch, xsimd::batch #else #define BATCH_FLOAT_TYPES xsimd::batch #endif -#if XSIMD_WITH_NEON64 || !XSIMD_WITH_NEON +#if XSIMD_WITH_NEON64 || (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) #define BATCH_COMPLEX_TYPES xsimd::batch>, xsimd::batch> #else #define BATCH_COMPLEX_TYPES xsimd::batch> From 6e5978ca7720d93c620f18070da8e70d9dfa54db Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 18:08:22 +0200 Subject: [PATCH 10/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 84 +++++----------------------- 1 file changed, 14 insertions(+), 70 deletions(-) diff --git 
a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index ce6c2a30f..b5809b7af 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -179,12 +179,12 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { - return vec_andc(other, self); + return vec_nand(self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_andc(other, self); + return vec_nand(self, other); } #if 0 @@ -343,38 +343,18 @@ namespace xsimd } } +#endif // bitwise_xor - template - XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_xor_ps(self, other); - } - template - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_xor_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_xor_si128(self, other); - } - template - XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_xor_pd(self, other); + return vec_xor(self, other); } - template - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_xor_pd(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return _mm_xor_si128(self, other); + return vec_xor(self, other); } -#endif // bitwise_cast template @@ -383,44 +363,13 @@ namespace xsimd return *reinterpret_cast::register_type const*>(&self.data); } -#if 0 - // broadcast - template - batch XSIMD_INLINE broadcast(float val, requires_arch) noexcept - { - return _mm_set1_ps(val); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_set1_epi8(val); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_set1_epi16(val); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_set1_epi32(val); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_set1_epi64x(val); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - template - XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept - { - return _mm_set1_pd(val); + return vec_splats(val); } +#if 0 // store_complex namespace detail @@ -573,7 +522,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { - return vec_extract(self, 0); + return vec_extract(self, 0); } #if 0 @@ -752,20 +701,15 @@ namespace xsimd { return self - batch(mask.data); } +#endif // insert template ::value, void>::type> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_insert_epi16(self, val, I); - } - else - { - return insert(self, val, pos, common {}); - } + return vec_insert(val, self, pos); } +#if 0 // isnan template From 153c0580f192347573402f57ec924a67800f4a59 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 18:46:56 +0200 Subject: 
[PATCH 11/57] set --- include/xsimd/arch/xsimd_altivec.hpp | 58 ++-------------------------- 1 file changed, 4 insertions(+), 54 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index b5809b7af..b0e53d5c1 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -1216,65 +1216,15 @@ namespace xsimd { return vec_adds(self, other); } -#if 0 // set - template - XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept - { - static_assert(sizeof...(Values) == batch::size, "consistent init"); - return _mm_setr_ps(values...); - } - - template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1) noexcept - { - return _mm_set_epi64x(v1, v0); - } - template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept - { - return _mm_setr_epi32(v0, v1, v2, v3); - } - template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept - { - return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); - } - template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { - return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return typename batch::register_type { values... }; } - template - XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept - { - static_assert(sizeof...(Values) == batch::size, "consistent init"); - return _mm_setr_pd(values...); - } - - template ::value, void>::type> - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept - { - return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; - } - - template - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept - { - static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); - return _mm_castsi128_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); - } - - template - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept - { - static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); - return _mm_castsi128_pd(set(batch(), A {}, static_cast(values ? 
-1LL : 0LL)...).data); - } -#endif - // ssub template ::value, void>::type> From ae4f6fe0b20d720afc9c604f26db36ace535adad Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 18:57:49 +0200 Subject: [PATCH 12/57] eq --- include/xsimd/arch/xsimd_altivec.hpp | 116 ++++----------------------- 1 file changed, 16 insertions(+), 100 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index b0e53d5c1..c77f6449e 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -81,42 +81,21 @@ namespace xsimd return vec_add(self, other); } -#if 0 - // all - template - XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept - { - return _mm_movemask_ps(self) == 0x0F; - } - template - XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept - { - return _mm_movemask_pd(self) == 0x03; - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { - return _mm_movemask_epi8(self) == 0xFFFF; + return vec_all_ne(self, vec_xor(self, self)); } // any - template - XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept - { - return _mm_movemask_ps(self) != 0; - } - template - XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept - { - return _mm_movemask_pd(self) != 0; - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { - return _mm_movemask_epi8(self) != 0; + return vec_any_ne(self, vec_xor(self, self)); } +#if 0 // avgr template ::value, void>::type> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept @@ -460,63 +439,19 @@ namespace xsimd return _mm_cvttps_epi32(self); } } +#endif // eq - template - XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmpeq_ps(self, other); - } - template - XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other))); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_cmpeq_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_cmpeq_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_cmpeq_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - __m128i tmp1 = _mm_cmpeq_epi32(self, other); - __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1); - __m128i tmp3 = _mm_and_si128(tmp1, tmp2); - __m128i tmp4 = _mm_srai_epi32(tmp3, 31); - return _mm_shuffle_epi32(tmp4, 0xF5); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } + return vec_cmpeq(self, other); } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return ~(self != other); + return vec_cmpeq(self, other); } - template - XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmpeq_pd(self, other); - } - template - XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return 
_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); - } -#endif // first template ::value, void>::type> @@ -987,39 +922,20 @@ namespace xsimd return _mm_xor_pd( self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); } +#endif // neq - template - XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmpneq_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { - return ~(self == other); + return vec_cmpne(self, other); } - template - XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_xor_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data))); - } - - template - XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmpneq_pd(self, other); - } - template - XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_xor_pd(self, other); + return vec_cmpne(self, other); } +#if 0 // reciprocal template From fc7d26f16a8024c5e5b370b78f449a506ac78a3b Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 19:59:01 +0200 Subject: [PATCH 13/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 337 ++++----------------------- 1 file changed, 47 insertions(+), 290 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index c77f6449e..5f07474f3 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -166,163 +166,48 @@ namespace xsimd return vec_nand(self, other); } -#if 0 // bitwise_lshift template ::value, void>::type> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_slli_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_slli_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_slli_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } + using shift_type = as_unsigned_integer_t; + batch shift(static_cast(other)); + return vec_sl(self, shift); } // bitwise_not - template - XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept - { - return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); - } - template - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept - { - return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { - return _mm_xor_si128(self, _mm_set1_epi32(-1)); + return vec_nor(self, self); } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { - return _mm_xor_si128(self, _mm_set1_epi32(-1)); - } - template - XSIMD_INLINE batch 
bitwise_not(batch const& self, requires_arch) noexcept - { - return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); - } - template - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept - { - return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); + return vec_nor(self, self); } // bitwise_or - template - XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_or_ps(self, other); - } - template - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_or_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_or_si128(self, other); + return vec_or(self, other); } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return _mm_or_si128(self, other); - } - - template - XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_or_pd(self, other); - } - - template - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_or_pd(self, other); + return vec_or(self, other); } // bitwise_rshift template ::value, void>::type> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { - if (std::is_signed::value) - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF); - __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self); - __m128i res = _mm_srai_epi16(self, other); - return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_srai_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_srai_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - // from https://github.com/samyvilar/vect/blob/master/vect_128.h - return _mm_or_si128( - _mm_srli_epi64(self, other), - _mm_slli_epi64( - _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32), - 64 - other)); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - else - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_srli_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_srli_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_srli_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } + using shift_type = as_unsigned_integer_t; + batch shift(static_cast(other)); + return vec_sr(self, shift); } -#endif // bitwise_xor template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept @@ -556,59 +441,33 @@ namespace xsimd return _mm_castpd_si128(from_mask(batch_bool {}, mask, altivec {})); } } - +#endif // ge - template - XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& 
other, requires_arch) noexcept { - return _mm_cmpge_ps(self, other); + return vec_cmpge(self, other); } template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_cmpge_pd(self, other); + return vec_cmpge(self, other); } // gt - template - XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmpgt_ps(self, other); - } template ::value, void>::type> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { - if (std::is_signed::value) - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_cmpgt_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_cmpgt_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_cmpgt_epi32(self, other); - } - else - { - return gt(self, other, common {}); - } - } - else - { - return gt(self, other, common {}); - } + return vec_cmpgt(self, other); } - template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_cmpgt_pd(self, other); + return vec_cmpgt(self, other); } +#if 0 + // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept @@ -644,21 +503,14 @@ namespace xsimd { return vec_insert(val, self, pos); } -#if 0 // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { - return _mm_cmpunord_ps(self, self); - } - template - XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept - { - return _mm_cmpunord_pd(self, self); + return ~vec_cmpeq(self, self); } -#endif // load_aligned template ::value, void>::type> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept @@ -690,99 +542,35 @@ namespace xsimd } } #endif -#if 0 // le - template - XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_cmple_ps(self, other); + return vec_cmple(self, other); } template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_cmple_pd(self, other); + return vec_cmple(self, other); } // lt - template - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmplt_ps(self, other); - } template ::value, void>::type> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { - if (std::is_signed::value) - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_cmplt_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_cmplt_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_cmplt_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - __m128i tmp1 = _mm_sub_epi64(self, other); - __m128i tmp2 = _mm_xor_si128(self, other); - __m128i tmp3 = _mm_andnot_si128(other, self); - __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); - __m128i tmp5 = _mm_or_si128(tmp3, tmp4); - __m128i tmp6 = _mm_srai_epi32(tmp5, 31); - return _mm_shuffle_epi32(tmp6, 0xF5); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - else - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits::lowest()))); - } - else 
XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits::lowest()))); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits::lowest()))); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits::lowest())); - auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits::lowest())); - __m128i tmp1 = _mm_sub_epi64(xself, xother); - __m128i tmp2 = _mm_xor_si128(xself, xother); - __m128i tmp3 = _mm_andnot_si128(xother, xself); - __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); - __m128i tmp5 = _mm_or_si128(tmp3, tmp4); - __m128i tmp6 = _mm_srai_epi32(tmp5, 31); - return _mm_shuffle_epi32(tmp6, 0xF5); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } + return vec_cmplt(self, other); } - template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_cmplt_pd(self, other); + return vec_cmplt(self, other); } +#if 0 + + + /* compression table to turn 0b10 into 0b1, * 0b100010 into 0b101 etc */ @@ -853,40 +641,21 @@ namespace xsimd return _mm_movemask_pd(self); } +#endif + // max - template - XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_max_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { - return select(self > other, self, other); - } - template - XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_max_pd(self, other); + return vec_max(self, other); } // min - template - XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_min_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { - return select(self <= other, self, other); + return vec_min(self, other); } - template - XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_min_pd(self, other); - } -#endif // mul template ::value, void>::type> @@ -903,48 +672,36 @@ namespace xsimd { return _mm_cvtps_epi32(self); } +#endif // neg - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { - return 0 - self; - } - template - XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept - { - return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); + return vec_neg(self); } - template - XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept - { - return _mm_xor_pd( - self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); - } -#endif - // neq template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpne(self, other); + return ~vec_cmpeq(self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_cmpne(self, other); + return ~vec_cmpeq(self, other); } -#if 0 // reciprocal template 
XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) { - return _mm_rcp_ps(self); + return vec_re(self); } +#if 0 // reduce_add template XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept From f6a2a1bc8b13eff992c6ac3f80de848c8662a6e9 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 20:19:07 +0200 Subject: [PATCH 14/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 60 +++++++--------------------- 1 file changed, 14 insertions(+), 46 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 5f07474f3..d1f3047ef 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -95,45 +95,21 @@ namespace xsimd return vec_any_ne(self, vec_xor(self, self)); } -#if 0 // avgr - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_avg_epu8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_avg_epu16(self, other); - } - else - { - return avgr(self, other, common {}); - } + return vec_avg(self, other); } // avg - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - auto adj = ((self ^ other) << 7) >> 7; - return avgr(self, other, A {}) - adj; - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - auto adj = ((self ^ other) << 15) >> 15; - return avgr(self, other, A {}) - adj; - } - else - { - return avg(self, other, common {}); - } + constexpr auto nbit = 8 * sizeof(T) - 1; + constexpr auto adj = ((self ^ other) << nbit) >> nbit; + return avgr(self, other, A {}) - adj; } -#endif // batch_bool_cast template @@ -482,12 +458,7 @@ namespace xsimd tmp0 = _mm_movelh_ps(tmp0, tmp1); return _mm_add_ps(tmp0, tmp2); } - template - XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept - { - return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), - _mm_unpackhi_pd(row[0], row[1])); - } +#endif // incr_if template ::value, void>::type> @@ -495,7 +466,6 @@ namespace xsimd { return self - batch(mask.data); } -#endif // insert template ::value, void>::type> @@ -522,7 +492,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { - return *(typename batch::register_type)mem; + return batch(*(typename batch::register_type)mem); } #if 0 @@ -791,19 +761,17 @@ namespace xsimd batch acc3 = min(acc2, step3); return first(acc3, A {}); } +#endif // rsqrt template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { - return _mm_rsqrt_ps(val); - } - template - XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept - { - return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val))); + return vec_rsqrt(val); } +#if 0 + // select template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept @@ -917,14 +885,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { - return vec_st(self, 0, mem); + return vec_st(self.data, 0, mem); } // store_unaligned template ::value, void>::type> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { - *(typename batch::register_type)mem = self; + *(typename 
batch::register_type)mem = self.data; } // sub From c19b1cb2e40d58fbf1b6eb2abe9ad7c81de45d0e Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 20:24:31 +0200 Subject: [PATCH 15/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 97 ++++------------------------ 1 file changed, 11 insertions(+), 86 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index d1f3047ef..0d019150b 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -770,30 +770,18 @@ namespace xsimd return vec_rsqrt(val); } -#if 0 - // select - template - XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept - { - return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); - } - - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); + return vec_sel(true_br, false_br, cond); } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... }, true_br, false_br, altivec {}); } - template - XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept - { - return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); - } +#if 0 // shuffle template @@ -823,18 +811,15 @@ namespace xsimd return _mm_shuffle_pd(y, x, smask); return shuffle(x, y, mask, common {}); } +#endif // sqrt template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { - return _mm_sqrt_ps(val); - } - template - XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept - { - return _mm_sqrt_pd(val); + return vec_sqrt(val); } +#if 0 // slide_left template @@ -1014,80 +999,20 @@ namespace xsimd transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } +#endif // zip_hi - template - XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_unpackhi_ps(self, other); - } template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_unpackhi_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_unpackhi_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_unpackhi_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_unpackhi_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - template - XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_unpackhi_pd(self, other); + return vec_merge_hi(self, other); } // zip_lo - template - XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_unpacklo_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_unpacklo_epi8(self, other); - } - else 
XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_unpacklo_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_unpacklo_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_unpacklo_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } + return vec_mergel(self, other); } - template - XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_unpacklo_pd(self, other); - } -#endif } } From f8fa5857960292f9f7cc96f7670bca8a177bafda Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 20:31:47 +0200 Subject: [PATCH 16/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 0d019150b..cd7d74408 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -492,7 +492,9 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { - return batch(*(typename batch::register_type)mem); + auto lo = vec_ld(0, mem); + auto hi = vec_ld(16, mem); + return vec_perm(lo, hi, vec_lvsl(0, mem)); } #if 0 From 2de3a6bf3e4ee7de07f2e26ecb6c607f2bab9b6f Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 20:43:33 +0200 Subject: [PATCH 17/57] store --- include/xsimd/arch/xsimd_altivec.hpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index cd7d74408..3705c2e3a 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -879,7 +879,24 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { - *(typename batch::register_type)mem = self.data; + // From: https://stackoverflow.com/questions/35317341/how-to-store-a-vector-to-an-unaligned-location-in-memory-with-altivec + // Load the surrounding area + auto low = vec_ld(0, dst); + auto high = vec_ld(16, dst); + // Prepare the constants that we need + auto permuteVector = vec_lvsr(0, (int*)mem); + auto oxFF = vec_splat_s8(-1); + auto ox00 = vec_splat_s8(0); + // Make a mask for which parts of the vectors to swap out + auto mask = vec_perm(ox00, oxFF, permuteVector); + // Right rotate our input data + v = vec_perm(self, self, permuteVector); + // Insert our data into the low and high vectors + low = vec_sel(self, low, mask); + high = vec_sel(high, self, mask); + // Store the two aligned result vectors + vec_st(low, 0, mem); + vec_st(high, 16, mem); } // sub From d66759f7a19b25e5661c3b97e0311f94de1be2a5 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 20:45:43 +0200 Subject: [PATCH 18/57] ++ --- include/xsimd/arch/xsimd_altivec.hpp | 54 +++++++--------------------- 1 file changed, 13 insertions(+), 41 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 3705c2e3a..1578151f3 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -122,12 +122,12 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { - return vec_and(self, other); + return vec_and(self.data, other.data); } template ::value, void>::type> 
XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_and(self, other); + return vec_and(self.data, other.data); } // bitwise_andnot @@ -239,6 +239,8 @@ namespace xsimd } } +#endif + // decr_if template ::value, void>::type> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept @@ -247,17 +249,14 @@ namespace xsimd } // div - template - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_div_ps(self, other); - } - template - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_div_pd(self, other); + return vec_div(self, other); } +#if 0 + // fast_cast namespace detail { @@ -267,33 +266,6 @@ namespace xsimd return _mm_cvtepi32_ps(self); } - template - XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept - { - // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx - // adapted to altivec - __m128i xH = _mm_srli_epi64(x, 32); - xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84 - __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); - __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 - __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 - return _mm_add_pd(f, _mm_castsi128_pd(xL)); - } - - template - XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept - { - // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx - // adapted to altivec - __m128i xH = _mm_srai_epi32(x, 16); - xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); - xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67 - __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); - __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 - __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 - return _mm_add_pd(f, _mm_castsi128_pd(xL)); - } - template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { @@ -306,12 +278,12 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpeq(self, other); + return vec_cmpeq(self.data, other.data); } template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_cmpeq(self, other); + return vec_cmpeq(self.data, other.data); } // first @@ -881,8 +853,8 @@ namespace xsimd { // From: https://stackoverflow.com/questions/35317341/how-to-store-a-vector-to-an-unaligned-location-in-memory-with-altivec // Load the surrounding area - auto low = vec_ld(0, dst); - auto high = vec_ld(16, dst); + auto low = vec_ld(0, mem); + auto high = vec_ld(16, mem); // Prepare the constants that we need auto permuteVector = vec_lvsr(0, 
(int*)mem); auto oxFF = vec_splat_s8(-1); From 540e0d277b9a5118059fd6b037c6305f79ef08ca Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 21:24:56 +0200 Subject: [PATCH 19/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 1578151f3..bbcf7f7bb 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -414,23 +414,23 @@ namespace xsimd return vec_cmpgt(self, other); } -#if 0 - // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { - __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); - __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); - __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]); - tmp0 = _mm_add_ps(tmp0, tmp1); - tmp1 = _mm_unpacklo_ps(row[2], row[3]); - tmp1 = _mm_add_ps(tmp1, tmp2); - tmp2 = _mm_movehl_ps(tmp1, tmp0); - tmp0 = _mm_movelh_ps(tmp0, tmp1); - return _mm_add_ps(tmp0, tmp2); + auto tmp0 = vec_mergee(row[0], row[1]); // v00 v10 v02 v12 + auto tmp1 = vec_mergeo(row[0], row[1]); // v01 v11 v03 v13 + auto tmp4 = vec_add(tmp0, tmp1); // (v00 + v01, v10 + v11, v02 + v03, v12 + v13) + + auto tmp2 = vec_mergee(row[2], row[3]); // v20 v30 v22 v32 + auto tmp3 = vec_mergeo(row[2], row[3]); // v21 v31 v23 v33 + auto tmp5 = vec_add(tmp2, tmp3); // (v20 + v21, v30 + v31, v22 + v23, v32 + v33) + + auto tmp6 = vec_permi(tmp4, tmp5, 0x0); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31) + auto tmp7 = vec_permi(tmp4, tmp5, 0x3); // (v02 + v03, v12 + v13, v22 + v23, v32 + v33) + + return vec_add(tmp6, tmp7); } -#endif // incr_if template ::value, void>::type> From 68ca9a97d7eb29c19ae2fb83bec1cf18316b17e6 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 21:32:48 +0200 Subject: [PATCH 20/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 46 +++++++++++++--------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index bbcf7f7bb..a98335a94 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -278,12 +278,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpeq(self.data, other.data); + auto res = vec_cmpeq(self.data, other.data); + return *reinterpret_cast::register_type*>(&res); } template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_cmpeq(self.data, other.data); + auto res = vec_cmpeq(self.data, other.data); + return *reinterpret_cast::register_type*>(&res); } // first @@ -793,23 +795,21 @@ namespace xsimd { return vec_sqrt(val); } -#if 0 // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { - return _mm_slli_si128(x, N); + return vec_sll(x, vec_splat_u8(N)); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { - return _mm_srli_si128(x, N); + return vec_srl(x, vec_splat_u8(N)); } -#endif // sadd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return vec_adds(self, other); } // set template - XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, Values...
values) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return typename batch::register_type { values... }; } + template ::value, void>::type> + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } + // ssub template ::value, void>::type> @@ -851,24 +857,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { - // From: https://stackoverflow.com/questions/35317341/how-to-store-a-vector-to-an-unaligned-location-in-memory-with-altivec - // Load the surrounding area - auto low = vec_ld(0, mem); - auto high = vec_ld(16, mem); - // Prepare the constants that we need - auto permuteVector = vec_lvsr(0, (int*)mem); - auto oxFF = vec_splat_s8(-1); - auto ox00 = vec_splat_s8(0); - // Make a mask for which parts of the vectors to swap out - auto mask = vec_perm(ox00, oxFF, permuteVector); - // Right rotate our input data - v = vec_perm(self, self, permuteVector); - // Insert our data into the low and high vectors - low = vec_sel(self, low, mask); - high = vec_sel(high, self, mask); - // Store the two aligned result vectors - vec_st(low, 0, mem); - vec_st(high, 16, mem); + auto tmp = vec_perm(*reinterpret_cast(&self.data), *reinterpret_cast(&self.data), vec_lvsr(0, (unsigned char*)mem)); + vec_ste((__vector unsigned char)tmp, 0, (unsigned char*)mem); + vec_ste((__vector unsigned short)tmp, 1, (unsigned short*)mem); + vec_ste((__vector unsigned int)tmp, 3, (unsigned int*)mem); + vec_ste((__vector unsigned int)tmp, 4, (unsigned int*)mem); + vec_ste((__vector unsigned int)tmp, 8, (unsigned int*)mem); + vec_ste((__vector unsigned int)tmp, 12, (unsigned int*)mem); + vec_ste((__vector unsigned short)tmp, 14, (unsigned short*)mem); } // sub From b5ab4f1a4a5da9090b1a66cfe8dbe11c9a1af5dc Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 23:12:47 +0200 Subject: [PATCH 21/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 43 ++++++---------------------- 1 file changed, 8 insertions(+), 35 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index a98335a94..7f16f092d 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -647,40 +647,18 @@ namespace xsimd return vec_re(self); } -#if 0 // reduce_add - template - XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept - { - __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self)); - __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); - return _mm_cvtss_f32(tmp1); - } - - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); - __m128i tmp2 = _mm_add_epi32(self, tmp1); - __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01); - __m128i tmp4 = _mm_add_epi32(tmp2, tmp3); - return _mm_cvtsi128_si32(tmp4); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); - __m128i tmp2 = _mm_add_epi64(self, tmp1); -#if defined(__x86_64__) - return _mm_cvtsi128_si64(tmp2); -#else - __m128i m; - _mm_storel_epi64(&m, tmp2); - int64_t i; - std::memcpy(&i, &m, sizeof(i)); - return i; -#endif + // FIXME: fine an in-order approach + auto tmp0 = 
vec_reve(self); // v3, v2, v1, v0 + auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 + auto tmp2 = vec_permi(tmp1, tmp1, 0x3); // v2 + v1, v3 + v0, v2 + v1, v3 + v0 + auto tmp3 = vec_add(tmp1, tmp2); + return vec_extract(tmp3, 0); } else { @@ -688,12 +666,7 @@ namespace xsimd } } - template - XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept - { - return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); - } - +#if 0 // reduce_max template ::type> XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept From 1bd2ce7fdbc9ca58b4f3aab1814ecfdf31a1689f Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 23:31:20 +0200 Subject: [PATCH 22/57] fast-cast --- include/xsimd/arch/xsimd_altivec.hpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 7f16f092d..da00dab14 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -255,24 +255,32 @@ namespace xsimd return vec_div(self, other); } -#if 0 - // fast_cast namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { - return _mm_cvtepi32_ps(self); + return vec_ctf(self.data, 0); + } + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return vec_ctf(self.data, 0); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { - return _mm_cvttps_epi32(self); + return vec_cts(self.data, 0); + } + + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return vec_ctu(self.data, 0); } } -#endif // eq template ::value, void>::type> From 36cd50cdd57a7771394ca3b604eec6e2741ee375 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 14 Jul 2025 08:36:31 +0200 Subject: [PATCH 23/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index da00dab14..b378a9e94 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -78,7 +78,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { - return vec_add(self, other); + return vec_add(self.data, other.data); } // all @@ -661,8 +661,7 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - // FIXME: fine an in-order approach - auto tmp0 = vec_reve(self); // v3, v2, v1, v0 + auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 auto tmp2 = vec_permi(tmp1, tmp1, 0x3); // v2 + v1, v3 + v0, v2 + v1, v3 + v0 auto tmp3 = vec_add(tmp1, tmp2); @@ -673,6 +672,16 @@ namespace xsimd return hadd(self, common {}); } } + template + XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept + { + // FIXME: find an in-order approach + auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 + auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 + auto tmp2 = vec_permi(tmp1, tmp1, 0x3); // v2 + v1, v3 + v0, v2 + v1, v3 + v0 + auto tmp3 = vec_add(tmp1, tmp2); + return vec_extract(tmp3, 0); + } #if 0 // reduce_max From 37988e7b76a533f38ff7bbd56e8b178f8477aa86 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: 
Mon, 14 Jul 2025 08:47:43 +0200 Subject: [PATCH 24/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index b378a9e94..9312c8602 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -663,7 +663,7 @@ namespace xsimd { auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 - auto tmp2 = vec_permi(tmp1, tmp1, 0x3); // v2 + v1, v3 + v0, v2 + v1, v3 + v0 + auto tmp2 = vec_mergeh(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0 auto tmp3 = vec_add(tmp1, tmp2); return vec_extract(tmp3, 0); } @@ -678,7 +678,7 @@ namespace xsimd // FIXME: find an in-order approach auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 - auto tmp2 = vec_permi(tmp1, tmp1, 0x3); // v2 + v1, v3 + v0, v2 + v1, v3 + v0 + auto tmp2 = vec_mergeh(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0 auto tmp3 = vec_add(tmp1, tmp2); return vec_extract(tmp3, 0); } From db1912344187aca4a1b58dc3a83df081ab895089 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 14 Jul 2025 08:56:41 +0200 Subject: [PATCH 25/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 9312c8602..61cc96d29 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -85,14 +85,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { - return vec_all_ne(self, vec_xor(self, self)); + return vec_all_ne(self, vec_xor(self.data, self.data)); } // any template ::value, void>::type> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { - return vec_any_ne(self, vec_xor(self, self)); + return vec_any_ne(self, vec_xor(self.data, self.data)); } // avgr @@ -250,7 +250,7 @@ namespace xsimd // div template ::value, void>::type> - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return vec_div(self, other); } @@ -740,7 +740,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return vec_sel(true_br, false_br, cond); + return vec_sel(true_br.data, false_br.data, cond.data); } template ::value, void>::type> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept From 308d4d597cee335ea239e895c3c28af10bfba444 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 14 Jul 2025 09:28:25 +0200 Subject: [PATCH 26/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 106 +++++++++++++-------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 61cc96d29..8d83a7bd1 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -85,21 +85,21 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { - return vec_all_ne(self, vec_xor(self.data, self.data)); + return 
vec_all_ne(self.data, vec_xor(self.data, self.data)); } // any template ::value, void>::type> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { - return vec_any_ne(self, vec_xor(self.data, self.data)); + return vec_any_ne(self.data, vec_xor(self.data, self.data)); } // avgr template ::value, void>::type> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { - return vec_avg(self, other); + return vec_avg(self.data, other.data); } // avg @@ -108,7 +108,7 @@ namespace xsimd { constexpr auto nbit = 8 * sizeof(T) - 1; constexpr auto adj = ((self ^ other) << nbit) >> nbit; - return avgr(self, other, A {}) - adj; + return avgr(self.data, other.data, A {}) - adj; } // batch_bool_cast @@ -134,12 +134,12 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { - return vec_nand(self, other); + return vec_nand(self.data, other.data); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_nand(self, other); + return vec_nand(self.data, other.data); } // bitwise_lshift @@ -148,31 +148,31 @@ namespace xsimd { using shift_type = as_unsigned_integer_t; batch shift(static_cast(other)); - return vec_sl(self, shift); + return vec_sl(self.data, shift.data); } // bitwise_not template ::value, void>::type> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { - return vec_nor(self, self); + return vec_nor(self.data, self.data); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { - return vec_nor(self, self); + return vec_nor(self.data, self.data); } // bitwise_or template ::value, void>::type> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { - return vec_or(self, other); + return vec_or(self.data, other.data); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_or(self, other); + return vec_or(self.data, other.data); } // bitwise_rshift @@ -181,19 +181,19 @@ namespace xsimd { using shift_type = as_unsigned_integer_t; batch shift(static_cast(other)); - return vec_sr(self, shift); + return vec_sr(self.data, shift.data); } // bitwise_xor template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { - return vec_xor(self, other); + return vec_xor(self.data, other.data); } template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_xor(self, other); + return vec_xor(self.data, other.data); } // bitwise_cast @@ -252,7 +252,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { - return vec_div(self, other); + return vec_div(self.data, other.data); } // fast_cast @@ -300,7 +300,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { - return vec_extract(self, 0); + return vec_extract(self.data, 0); } #if 0 @@ -404,36 +404,36 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpge(self, other); + return vec_cmpge(self.data, 
other.data); } - template - XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpge(self, other); + return vec_cmpge(self.data, other.data); } // gt template ::value, void>::type> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpgt(self, other); + return vec_cmpgt(self.data, other.data); } - template - XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpgt(self, other); + return vec_cmpgt(self.data, other.data); } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { - auto tmp0 = vec_mergee(row[0], row[1]); // v00 v10 v02 v12 - auto tmp1 = vec_mergeo(row[0], row[1]); // v01 v11 v03 v13 + auto tmp0 = vec_mergee(row[0].data, row[1].data); // v00 v10 v02 v12 + auto tmp1 = vec_mergeo(row[0].data, row[1].data); // v01 v11 v03 v13 auto tmp4 = vec_add(tmp0, tmp1); // (v00 + v01, v10 + v11, v02 + v03, v12 + v13) - auto tmp2 = vec_mergee(row[2], row[3]); // v20 v30 v22 v32 - auto tmp3 = vec_mergeo(row[2], row[3]); // v21 v31 v23 v33 + auto tmp2 = vec_mergee(row[2].data, row[3].data); // v20 v30 v22 v32 + auto tmp3 = vec_mergeo(row[2].data, row[3].data); // v21 v31 v23 v33 auto tmp5 = vec_add(tmp0, tmp1); // (v20 + v21, v30 + v31, v22 + v23, v32 + v33) auto tmp6 = vec_permi(tmp4, tmp5, 0x0); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31 @@ -453,14 +453,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { - return vec_insert(val, self, pos); + return vec_insert(val, self.data, pos); } // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { - return ~vec_cmpeq(self, self); + return ~vec_cmpeq(self.data, self.data); } // load_aligned @@ -501,22 +501,22 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmple(self, other); + return vec_cmple(self.data, other.data); } - template - XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmple(self, other); + return vec_cmple(self.data, other.data); } // lt template ::value, void>::type> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmplt(self, other); + return vec_cmplt(self.data, other.data); } - template - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmplt(self, other); } @@ -601,21 +601,21 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { - return vec_max(self, other); + return vec_max(self.data, other.data); } // min template ::value, void>::type> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { - return vec_min(self, other); + return 
vec_min(self.data, other.data); } // mul template ::value, void>::type> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { - return vec_mul(self, other); + return vec_mul(self.data, other.data); } #if 0 @@ -632,19 +632,19 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { - return vec_neg(self); + return vec_neg(self.data); } // neq template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { - return ~vec_cmpeq(self, other); + return ~vec_cmpeq(self.data, other.data); } template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return ~vec_cmpeq(self, other); + return ~vec_cmpeq(self.data, other.data); } // reciprocal @@ -652,7 +652,7 @@ namespace xsimd XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) { - return vec_re(self); + return vec_re(self.data); } // reduce_add @@ -733,7 +733,7 @@ namespace xsimd template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { - return vec_rsqrt(val); + return vec_rsqrt(val.data); } // select @@ -783,28 +783,28 @@ namespace xsimd template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { - return vec_sqrt(val); + return vec_sqrt(val.data); } // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { - return vec_sll(x, vec_splat_u8(N)); + return vec_sll(x.data, vec_splat_u8(N)); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { - return vec_srl(x, vec_splat_u8(N)); + return vec_srl(x.data, vec_splat_u8(N)); } // sadd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { - return vec_adds(self, other); + return vec_adds(self.data, other.data); } // set @@ -828,7 +828,7 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { - return vec_subs(self, other); + return vec_subs(self.data, other.data); } else { @@ -861,7 +861,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { - return vec_sub(self, other); + return vec_sub(self.data, other.data); } #if 0 @@ -981,14 +981,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { - return vec_merge_hi(self, other); + return vec_merge_hi(self.data, other.data); } // zip_lo template ::value, void>::type> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { - return vec_mergel(self, other); + return vec_mergel(self.data, other.data); } } } From 8c7d552c8fef5c3dba79ade14dcba683abc41c2c Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 14 Jul 2025 10:12:16 +0200 Subject: [PATCH 27/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 8d83a7bd1..97d12586c 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -406,11 +406,6 @@ namespace xsimd { return vec_cmpge(self.data, other.data); } - template ::value, void>::type> - XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept - { - return vec_cmpge(self.data, other.data); - } // gt 
template ::value, void>::type> @@ -418,11 +413,6 @@ namespace xsimd { return vec_cmpgt(self.data, other.data); } - template ::value, void>::type> - XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept - { - return vec_cmpgt(self.data, other.data); - } // haddp template @@ -503,11 +493,6 @@ namespace xsimd { return vec_cmple(self.data, other.data); } - template ::value, void>::type> - XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept - { - return vec_cmple(self.data, other.data); - } // lt template ::value, void>::type> @@ -515,11 +500,6 @@ namespace xsimd { return vec_cmplt(self.data, other.data); } - template ::value, void>::type> - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept - { - return vec_cmplt(self, other); - } #if 0 From ca1fd743dd5d59dbeedc3526677e68bd2aaa8d1b Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 14 Jul 2025 11:07:51 +0200 Subject: [PATCH 28/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 2 +- .../xsimd/types/xsimd_altivec_register.hpp | 27 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 97d12586c..94fc01e0f 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -252,7 +252,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { - return vec_div(self.data, other.data); + return vec_mul(self.data, vec_re(other.data)); } // fast_cast diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp index 52f896bf2..cf15a3f9f 100644 --- a/include/xsimd/types/xsimd_altivec_register.hpp +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -38,14 +38,25 @@ namespace xsimd #if XSIMD_WITH_ALTIVEC namespace types { - XSIMD_DECLARE_SIMD_REGISTER(signed char, altivec, __vector signed char); - XSIMD_DECLARE_SIMD_REGISTER(unsigned char, altivec, __vector unsigned char); - XSIMD_DECLARE_SIMD_REGISTER(char, altivec, __vector char); - XSIMD_DECLARE_SIMD_REGISTER(unsigned short, altivec, __vector unsigned short); - XSIMD_DECLARE_SIMD_REGISTER(short, altivec, __vector short); - XSIMD_DECLARE_SIMD_REGISTER(unsigned int, altivec, __vector unsigned int); - XSIMD_DECLARE_SIMD_REGISTER(int, altivec, __vector int); - XSIMD_DECLARE_SIMD_REGISTER(float, altivec, __vector float); + +#define XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(T, Tb) \ + template <> \ + struct get_bool_simd_register \ + { \ + using type = __vector __bool Tb; \ + }; \ + XSIMD_DECLARE_SIMD_REGISTER(T, altivec, __vector T) + + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(signed char, char); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned char, char); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(char, char); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned short, short); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(short, short); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned int, int); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(int, int); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(float, float); + +#undef XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER } #endif } From 8602e056dcff88e1b1d9b5db47f045c5a4239d45 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 15 Jul 2025 01:14:24 +0200 Subject: [PATCH 29/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 3 ++- .../xsimd/types/xsimd_altivec_register.hpp | 24 
+++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 94fc01e0f..9310838f2 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -798,7 +798,8 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { - return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return typename batch_bool::register_type { static_cast::register_type>()[0])>(values ? -1LL : 0LL)... }; } // ssub diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp index cf15a3f9f..2ed7d89ee 100644 --- a/include/xsimd/types/xsimd_altivec_register.hpp +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -39,12 +39,22 @@ namespace xsimd namespace types { -#define XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(T, Tb) \ - template <> \ - struct get_bool_simd_register \ - { \ - using type = __vector __bool Tb; \ - }; \ +#define XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(T, Tb) \ + template <> \ + struct get_bool_simd_register \ + { \ + struct type \ + { \ + using register_type = __vector __bool Tb; \ + register_type data; \ + type() = default; \ + type(register_type r) \ + : data(r) \ + { \ + } \ + operator register_type() const noexcept { return data; } \ + }; \ + }; \ XSIMD_DECLARE_SIMD_REGISTER(T, altivec, __vector T) XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(signed char, char); @@ -54,7 +64,7 @@ namespace xsimd XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(short, short); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned int, int); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(int, int); - XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(float, float); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(float, int); #undef XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER } From 4f4092d1954707b145635797e355eb0ad73ad91c Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 15 Jul 2025 12:18:21 +0200 Subject: [PATCH 30/57] WIP --- include/xsimd/config/xsimd_inline.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/xsimd/config/xsimd_inline.hpp b/include/xsimd/config/xsimd_inline.hpp index 88e9cbcd0..33dba1033 100644 --- a/include/xsimd/config/xsimd_inline.hpp +++ b/include/xsimd/config/xsimd_inline.hpp @@ -12,6 +12,10 @@ #ifndef XSIMD_INLINE_HPP #define XSIMD_INLINE_HPP +#if defined(__VEC__) +#define XSIMD_INLINE inline +#else + #if defined(__GNUC__) #define XSIMD_INLINE inline __attribute__((always_inline)) #elif defined(_MSC_VER) @@ -21,3 +25,5 @@ #endif #endif + +#endif From b7a286e62e2b041dec8dfb96d432ae761c3fbf3d Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 15 Jul 2025 12:33:38 +0200 Subject: [PATCH 31/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 9310838f2..649c79b35 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -249,8 +249,8 @@ namespace xsimd } // div - template ::value, void>::type> - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return vec_mul(self.data, vec_re(other.data)); } From 
e68ad27d7dea75b8e8dad13c280b2dd534f92f29 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 15 Jul 2025 12:36:58 +0200 Subject: [PATCH 32/57] gcc ver --- .github/workflows/cross-ppc.yml | 4 ++-- include/xsimd/arch/xsimd_altivec.hpp | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml index ec7a11f04..f1617b88b 100644 --- a/.github/workflows/cross-ppc.yml +++ b/.github/workflows/cross-ppc.yml @@ -10,9 +10,9 @@ jobs: strategy: matrix: target: - - { platform: 'ppc', dir: 'powerpc-linux-gnu', flags: '-maltivec', full: 'OFF' } + - { platform: 'ppc', dir: 'powerpc-linux-gnu', flags: '-maltivec -mvsx', full: 'OFF' } sys: - - { compiler: 'gcc', version: '10' } + - { compiler: 'gcc', version: '12' } steps: - name: Setup compiler if: ${{ matrix.sys.compiler == 'gcc' }} diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 649c79b35..509c76646 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -71,7 +71,7 @@ namespace xsimd template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { - return vec_abs(self); + return vec_abs(self.data); } // add @@ -401,14 +401,14 @@ namespace xsimd } #endif // ge - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmpge(self.data, other.data); } // gt - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmpgt(self.data, other.data); @@ -440,7 +440,7 @@ namespace xsimd } // insert - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { return vec_insert(val, self.data, pos); @@ -488,14 +488,14 @@ namespace xsimd #endif // le - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmple(self.data, other.data); } // lt - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmplt(self.data, other.data); @@ -592,10 +592,11 @@ namespace xsimd } // mul - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { - return vec_mul(self.data, other.data); + return self.data * other.data; + // return vec_mul(self.data, other.data); } #if 0 From 1040fef6482160b84c86f3128bf79d9860d8f207 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 09:46:11 +0200 Subject: [PATCH 33/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 509c76646..f3451681a 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -66,6 +66,8 @@ namespace xsimd template XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; #endif + template + XSIMD_INLINE batch ssub(batch const&, batch const&, requires_arch) noexcept; // abs template @@ -103,13 +105,18 @@ namespace xsimd } // avg - template ::value, void>::type> + template 
::value, void>::type> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { constexpr auto nbit = 8 * sizeof(T) - 1; - constexpr auto adj = ((self ^ other) << nbit) >> nbit; + auto adj = ((self ^ other) << nbit) >> nbit; return avgr(self.data, other.data, A {}) - adj; } + template + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_avg(self.data, other.data); + } // batch_bool_cast template @@ -613,7 +620,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { - return vec_neg(self.data); + return -(self.data); } // neq From 1c231958841e1fd522f6a7e8595a7e2341ae3da2 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 11:37:00 +0200 Subject: [PATCH 34/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index f3451681a..0c407db5f 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -66,8 +66,6 @@ namespace xsimd template XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; #endif - template - XSIMD_INLINE batch ssub(batch const&, batch const&, requires_arch) noexcept; // abs template @@ -98,7 +96,7 @@ namespace xsimd } // avgr - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { return vec_avg(self.data, other.data); @@ -112,11 +110,6 @@ namespace xsimd auto adj = ((self ^ other) << nbit) >> nbit; return avgr(self.data, other.data, A {}) - adj; } - template - XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept - { - return vec_avg(self.data, other.data); - } // batch_bool_cast template @@ -812,17 +805,10 @@ namespace xsimd // ssub - template ::value, void>::type> + template ::value && sizeof(T) == 1, void>::type> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return vec_subs(self.data, other.data); - } - else - { - return ssub(self, other, common {}); - } + return vec_subs(self.data, other.data); } // store_aligned From 62b9257abcca955356e6dc9737ed63f74285a5b8 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 11:48:29 +0200 Subject: [PATCH 35/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 0c407db5f..4d7a1e76c 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -108,7 +108,7 @@ namespace xsimd { constexpr auto nbit = 8 * sizeof(T) - 1; auto adj = ((self ^ other) << nbit) >> nbit; - return avgr(self.data, other.data, A {}) - adj; + return avgr(self, other, A {}) - adj; } // batch_bool_cast @@ -782,7 +782,7 @@ namespace xsimd } // sadd - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return vec_adds(self.data, other.data); From 37548ca85aa1897412cb479c058f5c40c3b55e19 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 19:04:21 +0200 Subject: [PATCH 36/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 4d7a1e76c..e84aa25f2 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -134,12 +134,12 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { - return vec_nand(self.data, other.data); + return self.data & ~other.data; } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_nand(self.data, other.data); + return self.data & ~other.data; } // bitwise_lshift @@ -191,7 +191,7 @@ namespace xsimd return vec_xor(self.data, other.data); } template ::value, void>::type> - XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return vec_xor(self.data, other.data); } From 7e1d26a698ec775423bdfab0b68d8b6f93490fc7 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 23:36:21 +0200 Subject: [PATCH 37/57] double --- test/test_batch_cast.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/test_batch_cast.cpp b/test/test_batch_cast.cpp index 8a638ed24..5d84176d3 100644 --- a/test/test_batch_cast.cpp +++ b/test/test_batch_cast.cpp @@ -70,15 +70,18 @@ struct batch_cast_test using uint16_batch = xsimd::batch; using int32_batch = xsimd::batch; using uint32_batch = xsimd::batch; + using float_batch = xsimd::batch; +#ifndef XSIMD_WITH_ALTIVEC using int64_batch = xsimd::batch; using uint64_batch = xsimd::batch; - using float_batch = xsimd::batch; using double_batch = xsimd::batch; +#endif std::vector int_test_values; - std::vector float_test_values; std::vector double_test_values; + std::vector float_test_values; + batch_cast_test() { int_test_values = { @@ -182,12 +185,14 @@ struct batch_cast_test test_cast_impl(test_value, "batch cast uint32 -> uint32"); test_cast_impl(test_value, "batch cast uint32 -> float"); +#ifndef XSIMD_WITH_ALTIVEC test_cast_impl(test_value, "batch cast int64 -> int64"); test_cast_impl(test_value, "batch cast int64 -> uint64"); test_cast_impl(test_value, "batch cast int64 -> double"); test_cast_impl(test_value, "batch cast uint64 -> int64"); test_cast_impl(test_value, "batch cast uint64 -> uint64"); test_cast_impl(test_value, "batch cast uint64 -> double"); +#endif } for (const auto& test_value : float_test_values) @@ -197,12 +202,14 @@ struct batch_cast_test test_cast_impl(test_value, "batch cast float -> float"); } +#ifndef XSIMD_WITH_ALTIVEC for (const auto& test_value : double_test_values) { test_cast_impl(test_value, "batch cast double -> int64"); test_cast_impl(test_value, "batch cast double -> uint64"); test_cast_impl(test_value, "batch cast double -> double"); } +#endif } #if 0 && XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION From 35aa9e7733758c20f552d155d2daf65dbcb5a8d4 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 23:59:07 +0200 Subject: [PATCH 38/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index e84aa25f2..86373b072 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -115,7 
+115,7 @@ namespace xsimd template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { - return { bitwise_cast(batch(self.data)).data }; + return (typename batch_bool::register_type)self.data; } // bitwise_and From abfece217311cb39f267a509b1896c575b27b012 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 17 Jul 2025 00:14:56 +0200 Subject: [PATCH 39/57] WIP --- .github/workflows/cross-ppc.yml | 2 +- include/xsimd/arch/xsimd_altivec.hpp | 19 ++------- test/CMakeLists.txt | 58 ++++++++++++++-------------- 3 files changed, 33 insertions(+), 46 deletions(-) diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml index f1617b88b..70695b0c6 100644 --- a/.github/workflows/cross-ppc.yml +++ b/.github/workflows/cross-ppc.yml @@ -39,5 +39,5 @@ jobs: run: cmake --build _build --verbose -j1 - name: Testing xsimd run: | - qemu-${{ matrix.target.platform }} -cpu 7400 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd + qemu-${{ matrix.target.platform }} -cpu 7457 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd working-directory: ${{ github.workspace }}/_build diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 86373b072..27d54d9df 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -209,7 +209,6 @@ namespace xsimd { return vec_splats(val); } -#if 0 // store_complex namespace detail @@ -219,28 +218,16 @@ namespace xsimd template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { - return _mm_unpacklo_ps(self.real(), self.imag()); + return vec_mergel(self.real().data, self.imag().data); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { - return _mm_unpackhi_ps(self.real(), self.imag()); - } - template - XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept - { - return _mm_unpacklo_pd(self.real(), self.imag()); - } - template - XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept - { - return _mm_unpackhi_pd(self.real(), self.imag()); + return vec_mergeh(self.real().data, self.imag().data); } } -#endif - // decr_if template ::value, void>::type> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept @@ -956,7 +943,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { - return vec_merge_hi(self.data, other.data); + return vec_mergeh(self.data, other.data); } // zip_lo diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e6bad7999..95ab8d2af 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -131,39 +131,39 @@ set(XSIMD_TESTS main.cpp test_api.cpp test_arch.cpp - test_basic_math.cpp + #test_basic_math.cpp test_batch.cpp test_batch_bool.cpp test_batch_cast.cpp - test_batch_complex.cpp - test_batch_float.cpp - test_batch_int.cpp - test_bitwise_cast.cpp - test_batch_constant.cpp - test_batch_manip.cpp - test_complex_exponential.cpp - test_complex_hyperbolic.cpp - test_complex_power.cpp - test_complex_trigonometric.cpp - test_conversion.cpp - test_custom_default_arch.cpp - test_error_gamma.cpp - test_explicit_batch_instantiation.cpp - test_exponential.cpp - test_extract_pair.cpp - test_fp_manipulation.cpp - test_hyperbolic.cpp - test_load_store.cpp - test_memory.cpp - test_poly_evaluation.cpp - test_power.cpp - test_rounding.cpp - test_select.cpp - 
test_shuffle.cpp + # test_batch_complex.cpp + # test_batch_float.cpp + # test_batch_int.cpp + # test_bitwise_cast.cpp + # test_batch_constant.cpp + # test_batch_manip.cpp + # test_complex_exponential.cpp + # test_complex_hyperbolic.cpp + # test_complex_power.cpp + # test_complex_trigonometric.cpp + # test_conversion.cpp + # test_custom_default_arch.cpp + # test_error_gamma.cpp + # test_explicit_batch_instantiation.cpp + # test_exponential.cpp + # test_extract_pair.cpp + # test_fp_manipulation.cpp + # test_hyperbolic.cpp + # test_load_store.cpp + # test_memory.cpp + # test_poly_evaluation.cpp + # test_power.cpp + # test_rounding.cpp + # test_select.cpp + # test_shuffle.cpp test_sum.cpp - test_traits.cpp - test_trigonometric.cpp - test_xsimd_api.cpp + # test_traits.cpp + # test_trigonometric.cpp + # test_xsimd_api.cpp test_utils.hpp ) From 4e8638a3b8fbb0fff534ede003e881d75489eda6 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 17 Jul 2025 10:46:22 +0200 Subject: [PATCH 40/57] WIP --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 95ab8d2af..8bb46afda 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -135,7 +135,7 @@ set(XSIMD_TESTS test_batch.cpp test_batch_bool.cpp test_batch_cast.cpp - # test_batch_complex.cpp + test_batch_complex.cpp # test_batch_float.cpp # test_batch_int.cpp # test_bitwise_cast.cpp From bdb295bf7be44ff155c3d3ea3bcf7ae7f35ee7f7 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 17 Jul 2025 10:57:41 +0200 Subject: [PATCH 41/57] WIP --- .github/toolchains/gcc-powerpc-linux-gnu.cmake | 5 ----- .github/toolchains/gcc-powerpc64-linux-gnu.cmake | 5 +++++ .github/workflows/cross-ppc.yml | 4 ++-- include/xsimd/arch/xsimd_altivec.hpp | 10 +--------- test/CMakeLists.txt | 2 +- 5 files changed, 9 insertions(+), 17 deletions(-) delete mode 100644 .github/toolchains/gcc-powerpc-linux-gnu.cmake create mode 100644 .github/toolchains/gcc-powerpc64-linux-gnu.cmake diff --git a/.github/toolchains/gcc-powerpc-linux-gnu.cmake b/.github/toolchains/gcc-powerpc-linux-gnu.cmake deleted file mode 100644 index a318f6412..000000000 --- a/.github/toolchains/gcc-powerpc-linux-gnu.cmake +++ /dev/null @@ -1,5 +0,0 @@ -set(CMAKE_SYSTEM_PROCESSOR powerpc) -set(triple powerpc-linux-gnu) - -include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) - diff --git a/.github/toolchains/gcc-powerpc64-linux-gnu.cmake b/.github/toolchains/gcc-powerpc64-linux-gnu.cmake new file mode 100644 index 000000000..5dd97d6c6 --- /dev/null +++ b/.github/toolchains/gcc-powerpc64-linux-gnu.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_PROCESSOR powerpc64) +set(triple powerpc64-linux-gnu) + +include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) + diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml index 70695b0c6..49114b7bb 100644 --- a/.github/workflows/cross-ppc.yml +++ b/.github/workflows/cross-ppc.yml @@ -10,7 +10,7 @@ jobs: strategy: matrix: target: - - { platform: 'ppc', dir: 'powerpc-linux-gnu', flags: '-maltivec -mvsx', full: 'OFF' } + - { platform: 'ppc64', dir: 'powerpc64-linux-gnu', flags: '-maltivec -mvsx -mcpu=power8', full: 'OFF' } sys: - { compiler: 'gcc', version: '12' } steps: @@ -39,5 +39,5 @@ jobs: run: cmake --build _build --verbose -j1 - name: Testing xsimd run: | - qemu-${{ matrix.target.platform }} -cpu 7457 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd + #qemu-${{ matrix.target.platform }} -cpu power8 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd working-directory: ${{ 
github.workspace }}/_build diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 27d54d9df..891c266f7 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -456,23 +456,15 @@ namespace xsimd return vec_perm(lo, hi, vec_lvsl(0, mem)); } -#if 0 // load_complex namespace detail { - // Redefine these methods in the SSE-based archs if required template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { - return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) }; - } - template - XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept - { - return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; + return { vec_mergee(hi.data, lo.data), vec_mergeo(hi.data, lo.data) }; } } -#endif // le template ::value, void>::type> diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8bb46afda..ac4af1406 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -131,7 +131,7 @@ set(XSIMD_TESTS main.cpp test_api.cpp test_arch.cpp - #test_basic_math.cpp + test_basic_math.cpp test_batch.cpp test_batch_bool.cpp test_batch_cast.cpp From de42edc3f981dcc5459866ca39badf7f0889ad05 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 17 Jul 2025 15:25:57 +0200 Subject: [PATCH 42/57] WIP --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ac4af1406..cde63cd56 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -136,7 +136,7 @@ set(XSIMD_TESTS test_batch_bool.cpp test_batch_cast.cpp test_batch_complex.cpp - # test_batch_float.cpp + test_batch_float.cpp # test_batch_int.cpp # test_bitwise_cast.cpp # test_batch_constant.cpp From 82da5e5fa8f68712e897d09be65e7ac7b5e9a712 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 01:03:33 +0200 Subject: [PATCH 43/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 891c266f7..6d7b76e6a 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -413,8 +413,8 @@ namespace xsimd auto tmp3 = vec_mergeo(row[2].data, row[3].data); // v21 v31 v23 v33 auto tmp5 = vec_add(tmp0, tmp1); // (v20 + v21, v30 + v31, v22 + v23, v32 + v33) - auto tmp6 = vec_permi(tmp4, tmp5, 0x0); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31 - auto tmp7 = vec_permi(tmp4, tmp5, 0x3); // (v02 + v03, v12 + v13, v12 + v13, v32 + v33) + auto tmp6 = vec_perm(tmp4, tmp5, (__vector unsigned char) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31 + auto tmp7 = vec_perm(tmp4, tmp5, (__vector unsigned char) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 }); // (v02 + v03, v12 + v13, v12 + v13, v32 + v33) return vec_add(tmp6, tmp7); } From ff1a2d6d6101b9b56321addecf6e58a0f02c3429 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 10:12:06 +0200 Subject: [PATCH 44/57] WIP --- test/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index cde63cd56..09c350a2a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -137,9 +137,9 @@ set(XSIMD_TESTS test_batch_cast.cpp test_batch_complex.cpp 
test_batch_float.cpp - # test_batch_int.cpp + test_batch_int.cpp # test_bitwise_cast.cpp - # test_batch_constant.cpp + test_batch_constant.cpp # test_batch_manip.cpp # test_complex_exponential.cpp # test_complex_hyperbolic.cpp From 32becba2fe621f86571f6938b22eb436b6e13608 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 10:21:10 +0200 Subject: [PATCH 45/57] WIP --- test/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 09c350a2a..e01e3684d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -138,7 +138,7 @@ set(XSIMD_TESTS test_batch_complex.cpp test_batch_float.cpp test_batch_int.cpp - # test_bitwise_cast.cpp + test_bitwise_cast.cpp test_batch_constant.cpp # test_batch_manip.cpp # test_complex_exponential.cpp @@ -153,7 +153,7 @@ set(XSIMD_TESTS # test_extract_pair.cpp # test_fp_manipulation.cpp # test_hyperbolic.cpp - # test_load_store.cpp + test_load_store.cpp # test_memory.cpp # test_poly_evaluation.cpp # test_power.cpp @@ -161,8 +161,8 @@ set(XSIMD_TESTS # test_select.cpp # test_shuffle.cpp test_sum.cpp - # test_traits.cpp - # test_trigonometric.cpp + test_traits.cpp + test_trigonometric.cpp # test_xsimd_api.cpp test_utils.hpp ) From 4e8d2702d0902f8a0bb2b2d359aba8a916fdbc70 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 10:37:09 +0200 Subject: [PATCH 46/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 2 +- test/CMakeLists.txt | 8 +- test/test_bitwise_cast.cpp | 108 ++++++++++++++++----------- 3 files changed, 71 insertions(+), 47 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 6d7b76e6a..7cbb861c9 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -430,7 +430,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { - return vec_insert(val, self.data, pos); + return vec_insert(val, self.data, I); } // isnan diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e01e3684d..883d2b39f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -141,10 +141,10 @@ set(XSIMD_TESTS test_bitwise_cast.cpp test_batch_constant.cpp # test_batch_manip.cpp - # test_complex_exponential.cpp - # test_complex_hyperbolic.cpp - # test_complex_power.cpp - # test_complex_trigonometric.cpp + test_complex_exponential.cpp + test_complex_hyperbolic.cpp + test_complex_power.cpp + test_complex_trigonometric.cpp # test_conversion.cpp # test_custom_default_arch.cpp # test_error_gamma.cpp diff --git a/test/test_bitwise_cast.cpp b/test/test_bitwise_cast.cpp index 59e19cfdb..8efedeadf 100644 --- a/test/test_bitwise_cast.cpp +++ b/test/test_bitwise_cast.cpp @@ -21,37 +21,44 @@ struct bitwise_cast_test static constexpr size_t N = CP::size; using int32_batch = xsimd::batch; - using int64_batch = xsimd::batch; using float_batch = xsimd::batch; - using double_batch = xsimd::batch; using int32_vector = std::vector>; - using int64_vector = std::vector>; using float_vector = std::vector>; - using double_vector = std::vector>; int32_vector ftoi32_res; - int32_vector dtoi32_res; - int64_vector ftoi64_res; - int64_vector dtoi64_res; float_vector i32tof_res; + +#ifndef XSIMD_WITH_ALTIVEC + using int64_batch = xsimd::batch; + using double_batch = xsimd::batch; + + using int64_vector = std::vector>; + using double_vector = std::vector>; + + int32_vector dtoi32_res; float_vector i64tof_res; 
float_vector dtof_res; + int64_vector ftoi64_res; + int64_vector dtoi64_res; double_vector i32tod_res; double_vector i64tod_res; double_vector ftod_res; +#endif bitwise_cast_test() : ftoi32_res(2 * N) - , dtoi32_res(2 * N) - , ftoi64_res(N) - , dtoi64_res(N) , i32tof_res(2 * N) +#ifndef XSIMD_WITH_ALTIVEC + , dtoi32_res(2 * N) , i64tof_res(2 * N) , dtof_res(2 * N) + , ftoi64_res(N) + , dtoi64_res(N) , i32tod_res(N) , i64tod_res(N) , ftod_res(N) +#endif { { int32_batch input = i32_input(); @@ -59,8 +66,22 @@ struct bitwise_cast_test b.i32[0] = input.get(0); b.i32[1] = input.get(1); std::fill(i32tof_res.begin(), i32tof_res.end(), b.f[0]); +#ifndef XSIMD_WITH_ALTIVEC std::fill(i32tod_res.begin(), i32tod_res.end(), b.d); +#endif + } + { + float_batch input = f_input(); + bitcast b; + b.f[0] = input.get(0); + b.f[1] = input.get(1); + std::fill(ftoi32_res.begin(), ftoi32_res.end(), b.i32[0]); +#ifndef XSIMD_WITH_ALTIVEC + std::fill(ftoi64_res.begin(), ftoi64_res.end(), b.i64); + std::fill(ftod_res.begin(), ftod_res.end(), b.d); +#endif } +#ifndef XSIMD_WITH_ALTIVEC { int64_batch input = i64_input(); bitcast b; @@ -72,15 +93,6 @@ struct bitwise_cast_test i64tof_res[2 * i + 1] = b.f[1]; } } - { - float_batch input = f_input(); - bitcast b; - b.f[0] = input.get(0); - b.f[1] = input.get(1); - std::fill(ftoi32_res.begin(), ftoi32_res.end(), b.i32[0]); - std::fill(ftoi64_res.begin(), ftoi64_res.end(), b.i64); - std::fill(ftod_res.begin(), ftod_res.end(), b.d); - } { double_batch input = d_input(); bitcast b; @@ -95,6 +107,7 @@ struct bitwise_cast_test dtof_res[2 * i + 1] = b.f[1]; } } +#endif } void test_to_int32() @@ -106,29 +119,14 @@ struct bitwise_cast_test INFO("to_int32(float)"); CHECK_VECTOR_EQ(i32vres, ftoi32_res); } +#ifndef XSIMD_WITH_ALTIVEC { int32_batch i32bres = xsimd::bitwise_cast(d_input()); i32bres.store_aligned(i32vres.data()); INFO("to_int32(double)"); CHECK_VECTOR_EQ(i32vres, dtoi32_res); } - } - - void test_to_int64() - { - int64_vector i64vres(int64_batch::size); - { - int64_batch i64bres = xsimd::bitwise_cast(f_input()); - i64bres.store_aligned(i64vres.data()); - INFO("to_int64(float)"); - CHECK_VECTOR_EQ(i64vres, ftoi64_res); - } - { - int64_batch i64bres = xsimd::bitwise_cast(d_input()); - i64bres.store_aligned(i64vres.data()); - INFO("to_int64(double)"); - CHECK_VECTOR_EQ(i64vres, dtoi64_res); - } +#endif } void test_to_float() @@ -140,6 +138,7 @@ struct bitwise_cast_test INFO("to_float(int32_t)"); CHECK_VECTOR_EQ(fvres, i32tof_res); } +#ifndef XSIMD_WITH_ALTIVEC { float_batch fbres = xsimd::bitwise_cast(i64_input()); fbres.store_aligned(fvres.data()); @@ -152,6 +151,26 @@ struct bitwise_cast_test INFO("to_float(double)"); CHECK_VECTOR_EQ(fvres, dtof_res); } +#endif + } + +#ifndef XSIMD_WITH_ALTIVEC + + void test_to_int64() + { + int64_vector i64vres(int64_batch::size); + { + int64_batch i64bres = xsimd::bitwise_cast(f_input()); + i64bres.store_aligned(i64vres.data()); + INFO("to_int64(float)"); + CHECK_VECTOR_EQ(i64vres, ftoi64_res); + } + { + int64_batch i64bres = xsimd::bitwise_cast(d_input()); + i64bres.store_aligned(i64vres.data()); + INFO("to_int64(double)"); + CHECK_VECTOR_EQ(i64vres, dtoi64_res); + } } void test_to_double() @@ -176,6 +195,7 @@ struct bitwise_cast_test CHECK_VECTOR_EQ(dvres, ftod_res); } } +#endif private: int32_batch i32_input() const @@ -183,20 +203,22 @@ struct bitwise_cast_test return int32_batch(2); } - int64_batch i64_input() const + float_batch f_input() const { - return int64_batch(2); + return float_batch(3.); } - float_batch f_input() const 
+#ifndef XSIMD_WITH_ALTIVEC + int64_batch i64_input() const { - return float_batch(3.); + return int64_batch(2); } double_batch d_input() const { return double_batch(2.5e17); } +#endif union bitcast { @@ -212,11 +234,13 @@ TEST_CASE_TEMPLATE("[bitwise cast]", B, CONVERSION_TYPES) bitwise_cast_test Test; SUBCASE("to_int32") { Test.test_to_int32(); } - SUBCASE("to_int64") { Test.test_to_int64(); } - SUBCASE("to_float") { Test.test_to_float(); } +#ifndef XSIMD_WITH_ALTIVEC + SUBCASE("to_int64") { Test.test_to_int64(); } + SUBCASE("to_double") { Test.test_to_double(); } +#endif } #endif #endif From d0be84bcb5e1800366647b983345acba20436aaa Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 12:13:36 +0200 Subject: [PATCH 47/57] WIP --- test/test_load_store.cpp | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index 7a4d80932..8da992b7d 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -32,14 +32,18 @@ struct load_store_test using uint16_vector_type = std::vector>; using int32_vector_type = std::vector>; using uint32_vector_type = std::vector>; +#ifndef XSIMD_WITH_ALTIVEC using int64_vector_type = std::vector>; using uint64_vector_type = std::vector>; +#endif #ifdef XSIMD_32_BIT_ABI using long_vector_type = std::vector>; using ulong_vector_type = std::vector>; #endif using float_vector_type = std::vector>; +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 using double_vector_type = std::vector>; +#endif int8_vector_type i8_vec; uint8_vector_type ui8_vec; @@ -47,14 +51,18 @@ struct load_store_test uint16_vector_type ui16_vec; int32_vector_type i32_vec; uint32_vector_type ui32_vec; +#ifndef XSIMD_WITH_ALTIVEC int64_vector_type i64_vec; uint64_vector_type ui64_vec; +#endif #ifdef XSIMD_32_BIT_ABI long_vector_type l_vec; ulong_vector_type ul_vec; #endif float_vector_type f_vec; +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 double_vector_type d_vec; +#endif array_type expected; @@ -66,14 +74,18 @@ struct load_store_test init_test_vector(ui16_vec); init_test_vector(i32_vec); init_test_vector(ui32_vec); +#ifndef XSIMD_WITH_ALTIVEC init_test_vector(i64_vec); init_test_vector(ui64_vec); +#endif #ifdef XSIMD_32_BIT_ABI init_test_vector(l_vec); init_test_vector(ul_vec); #endif init_test_vector(f_vec); +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 init_test_vector(d_vec); +#endif } void test_load() @@ -84,14 +96,16 @@ struct load_store_test test_load_impl(ui16_vec, "load uint16_t"); test_load_impl(i32_vec, "load int32_t"); test_load_impl(ui32_vec, "load uint32_t"); +#ifndef XSIMD_WITH_ALTIVEC test_load_impl(i64_vec, "load int64_t"); test_load_impl(ui64_vec, "load uint64_t"); +#endif #ifdef XSIMD_32_BIT_ABI test_load_impl(l_vec, "load long"); test_load_impl(ul_vec, "load unsigned long"); #endif test_load_impl(f_vec, "load float"); -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 test_load_impl(d_vec, "load double"); #endif } @@ -104,14 +118,16 @@ struct load_store_test test_store_impl(ui16_vec, "load uint16_t"); test_store_impl(i32_vec, "load int32_t"); test_store_impl(ui32_vec, "load uint32_t"); +#ifndef XSIMD_WITH_ALTIVEC test_store_impl(i64_vec, "load int64_t"); test_store_impl(ui64_vec, "load uint64_t"); +#endif #ifdef XSIMD_32_BIT_ABI test_store_impl(l_vec, "load long"); test_store_impl(ul_vec, "load unsigned long"); #endif 
test_store_impl(f_vec, "load float"); -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 test_store_impl(d_vec, "load double"); #endif } @@ -123,15 +139,17 @@ struct load_store_test test_gather_impl(ui16_vec, "gather uint16_t"); test_gather_impl(i32_vec, "gather int32_t"); test_gather_impl(ui32_vec, "gather uint32_t"); +#ifndef XSIMD_WITH_ALTIVEC test_gather_impl(i64_vec, "gather int64_t"); test_gather_impl(ui64_vec, "gather uint64_t"); +#endif #ifdef XSIMD_32_BIT_ABI test_gather_impl(l_vec, "gather long"); test_gather_impl(ul_vec, "gather unsigned long"); #endif test_gather_impl(f_vec, "gather float"); test_gather_impl(f_vec, "gather float"); -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 test_gather_impl(d_vec, "gather double"); #endif } @@ -144,14 +162,16 @@ struct load_store_test test_scatter_impl(ui16_vec, "scatter uint16_t"); test_scatter_impl(i32_vec, "scatter int32_t"); test_scatter_impl(ui32_vec, "scatter uint32_t"); +#ifndef XSIMD_WITH_ALTIVEC test_scatter_impl(i64_vec, "scatter int64_t"); test_scatter_impl(ui64_vec, "scatter uint64_t"); +#endif #ifdef XSIMD_32_BIT_ABI test_scatter_impl(l_vec, "scatter long"); test_scatter_impl(ul_vec, "scatter unsigned long"); #endif test_scatter_impl(f_vec, "scatter float"); -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 test_scatter_impl(d_vec, "scatter double"); #endif } From 209d0c5a0c5cf56e80283c822077f6c4f8c316e9 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 12:39:08 +0200 Subject: [PATCH 48/57] WIP --- test/CMakeLists.txt | 8 ++++---- test/test_batch_cast.cpp | 6 +++--- test/test_bitwise_cast.cpp | 20 ++++++++++---------- test/test_explicit_batch_instantiation.cpp | 4 +++- test/test_load_store.cpp | 14 +++++++------- 5 files changed, 27 insertions(+), 25 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 883d2b39f..3e2840076 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -147,10 +147,10 @@ set(XSIMD_TESTS test_complex_trigonometric.cpp # test_conversion.cpp # test_custom_default_arch.cpp - # test_error_gamma.cpp - # test_explicit_batch_instantiation.cpp - # test_exponential.cpp - # test_extract_pair.cpp + test_error_gamma.cpp + test_explicit_batch_instantiation.cpp + test_exponential.cpp + test_extract_pair.cpp # test_fp_manipulation.cpp # test_hyperbolic.cpp test_load_store.cpp diff --git a/test/test_batch_cast.cpp b/test/test_batch_cast.cpp index 5d84176d3..e2e43a2ac 100644 --- a/test/test_batch_cast.cpp +++ b/test/test_batch_cast.cpp @@ -71,7 +71,7 @@ struct batch_cast_test using int32_batch = xsimd::batch; using uint32_batch = xsimd::batch; using float_batch = xsimd::batch; -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC using int64_batch = xsimd::batch; using uint64_batch = xsimd::batch; using double_batch = xsimd::batch; @@ -185,7 +185,7 @@ struct batch_cast_test test_cast_impl(test_value, "batch cast uint32 -> uint32"); test_cast_impl(test_value, "batch cast uint32 -> float"); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC test_cast_impl(test_value, "batch cast int64 -> int64"); test_cast_impl(test_value, "batch cast int64 -> uint64"); test_cast_impl(test_value, "batch cast int64 -> double"); @@ -202,7 +202,7 @@ struct batch_cast_test test_cast_impl(test_value, "batch cast float -> float"); } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC for (const auto& test_value : 
double_test_values) { test_cast_impl(test_value, "batch cast double -> int64"); diff --git a/test/test_bitwise_cast.cpp b/test/test_bitwise_cast.cpp index 8efedeadf..ac9b5f050 100644 --- a/test/test_bitwise_cast.cpp +++ b/test/test_bitwise_cast.cpp @@ -29,7 +29,7 @@ struct bitwise_cast_test int32_vector ftoi32_res; float_vector i32tof_res; -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC using int64_batch = xsimd::batch; using double_batch = xsimd::batch; @@ -49,7 +49,7 @@ struct bitwise_cast_test bitwise_cast_test() : ftoi32_res(2 * N) , i32tof_res(2 * N) -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC , dtoi32_res(2 * N) , i64tof_res(2 * N) , dtof_res(2 * N) @@ -66,7 +66,7 @@ struct bitwise_cast_test b.i32[0] = input.get(0); b.i32[1] = input.get(1); std::fill(i32tof_res.begin(), i32tof_res.end(), b.f[0]); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC std::fill(i32tod_res.begin(), i32tod_res.end(), b.d); #endif } @@ -76,12 +76,12 @@ struct bitwise_cast_test b.f[0] = input.get(0); b.f[1] = input.get(1); std::fill(ftoi32_res.begin(), ftoi32_res.end(), b.i32[0]); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC std::fill(ftoi64_res.begin(), ftoi64_res.end(), b.i64); std::fill(ftod_res.begin(), ftod_res.end(), b.d); #endif } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC { int64_batch input = i64_input(); bitcast b; @@ -119,7 +119,7 @@ struct bitwise_cast_test INFO("to_int32(float)"); CHECK_VECTOR_EQ(i32vres, ftoi32_res); } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC { int32_batch i32bres = xsimd::bitwise_cast(d_input()); i32bres.store_aligned(i32vres.data()); @@ -138,7 +138,7 @@ struct bitwise_cast_test INFO("to_float(int32_t)"); CHECK_VECTOR_EQ(fvres, i32tof_res); } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC { float_batch fbres = xsimd::bitwise_cast(i64_input()); fbres.store_aligned(fvres.data()); @@ -154,7 +154,7 @@ struct bitwise_cast_test #endif } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC void test_to_int64() { @@ -208,7 +208,7 @@ struct bitwise_cast_test return float_batch(3.); } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC int64_batch i64_input() const { return int64_batch(2); @@ -236,7 +236,7 @@ TEST_CASE_TEMPLATE("[bitwise cast]", B, CONVERSION_TYPES) SUBCASE("to_float") { Test.test_to_float(); } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC SUBCASE("to_int64") { Test.test_to_int64(); } SUBCASE("to_double") { Test.test_to_double(); } diff --git a/test/test_explicit_batch_instantiation.cpp b/test/test_explicit_batch_instantiation.cpp index f988a6e06..290adc63e 100644 --- a/test/test_explicit_batch_instantiation.cpp +++ b/test/test_explicit_batch_instantiation.cpp @@ -22,10 +22,12 @@ namespace xsimd template class batch; template class batch; template class batch; +#if !XSIMD_WITH_ALTIVEC template class batch; template class batch; +#endif template class batch; -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 template class batch; #endif } diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index 8da992b7d..0d149c7fa 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -32,7 +32,7 @@ struct load_store_test using uint16_vector_type = std::vector>; using int32_vector_type = std::vector>; using uint32_vector_type = std::vector>; -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC using int64_vector_type = std::vector>; using uint64_vector_type = std::vector>; #endif @@ -51,7 +51,7 @@ struct load_store_test uint16_vector_type 
ui16_vec; int32_vector_type i32_vec; uint32_vector_type ui32_vec; -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC int64_vector_type i64_vec; uint64_vector_type ui64_vec; #endif @@ -74,7 +74,7 @@ struct load_store_test init_test_vector(ui16_vec); init_test_vector(i32_vec); init_test_vector(ui32_vec); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC init_test_vector(i64_vec); init_test_vector(ui64_vec); #endif @@ -96,7 +96,7 @@ struct load_store_test test_load_impl(ui16_vec, "load uint16_t"); test_load_impl(i32_vec, "load int32_t"); test_load_impl(ui32_vec, "load uint32_t"); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC test_load_impl(i64_vec, "load int64_t"); test_load_impl(ui64_vec, "load uint64_t"); #endif @@ -118,7 +118,7 @@ struct load_store_test test_store_impl(ui16_vec, "load uint16_t"); test_store_impl(i32_vec, "load int32_t"); test_store_impl(ui32_vec, "load uint32_t"); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC test_store_impl(i64_vec, "load int64_t"); test_store_impl(ui64_vec, "load uint64_t"); #endif @@ -139,7 +139,7 @@ struct load_store_test test_gather_impl(ui16_vec, "gather uint16_t"); test_gather_impl(i32_vec, "gather int32_t"); test_gather_impl(ui32_vec, "gather uint32_t"); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC test_gather_impl(i64_vec, "gather int64_t"); test_gather_impl(ui64_vec, "gather uint64_t"); #endif @@ -162,7 +162,7 @@ struct load_store_test test_scatter_impl(ui16_vec, "scatter uint16_t"); test_scatter_impl(i32_vec, "scatter int32_t"); test_scatter_impl(ui32_vec, "scatter uint32_t"); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC test_scatter_impl(i64_vec, "scatter int64_t"); test_scatter_impl(ui64_vec, "scatter uint64_t"); #endif From 6bb983045d3543905b704f98d15670101276197e Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 13:28:34 +0200 Subject: [PATCH 49/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 8 ++++---- test/CMakeLists.txt | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 7cbb861c9..a6680e2e8 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -444,15 +444,15 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { - return vec_ld(0, mem); + return vec_ld(0, reinterpret_cast::register_type*>(mem)); } // load_unaligned template ::value, void>::type> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { - auto lo = vec_ld(0, mem); - auto hi = vec_ld(16, mem); + auto lo = vec_ld(0, reinterpret_cast::register_type*>(mem)); + auto hi = vec_ld(16, reinterpret_cast::register_type*>(mem)); return vec_perm(lo, hi, vec_lvsl(0, mem)); } @@ -794,7 +794,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { - return vec_st(self.data, 0, mem); + return vec_st(self.data, 0, reinterpret_cast::register_type*>(mem)); } // store_unaligned diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3e2840076..bd6b98ef8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -146,15 +146,15 @@ set(XSIMD_TESTS test_complex_power.cpp test_complex_trigonometric.cpp # test_conversion.cpp - # test_custom_default_arch.cpp + test_custom_default_arch.cpp test_error_gamma.cpp test_explicit_batch_instantiation.cpp test_exponential.cpp test_extract_pair.cpp # 
test_fp_manipulation.cpp - # test_hyperbolic.cpp + test_hyperbolic.cpp test_load_store.cpp - # test_memory.cpp + test_memory.cpp # test_poly_evaluation.cpp # test_power.cpp # test_rounding.cpp From 86b3615e40468b07ebd05263d2a98c4283af1689 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sat, 19 Jul 2025 22:04:42 +0200 Subject: [PATCH 50/57] WIP --- test/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bd6b98ef8..c2db5fa0f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -155,8 +155,8 @@ set(XSIMD_TESTS test_hyperbolic.cpp test_load_store.cpp test_memory.cpp - # test_poly_evaluation.cpp - # test_power.cpp + test_poly_evaluation.cpp + test_power.cpp # test_rounding.cpp # test_select.cpp # test_shuffle.cpp From d65c8edb13a4f0a503c073c4e9238c31510054e3 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sat, 19 Jul 2025 23:02:05 +0200 Subject: [PATCH 51/57] WIP --- test/CMakeLists.txt | 6 +++--- test/test_conversion.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c2db5fa0f..9e91c2047 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -145,19 +145,19 @@ set(XSIMD_TESTS test_complex_hyperbolic.cpp test_complex_power.cpp test_complex_trigonometric.cpp - # test_conversion.cpp + test_conversion.cpp test_custom_default_arch.cpp test_error_gamma.cpp test_explicit_batch_instantiation.cpp test_exponential.cpp test_extract_pair.cpp - # test_fp_manipulation.cpp + test_fp_manipulation.cpp test_hyperbolic.cpp test_load_store.cpp test_memory.cpp test_poly_evaluation.cpp test_power.cpp - # test_rounding.cpp + test_rounding.cpp # test_select.cpp # test_shuffle.cpp test_sum.cpp diff --git a/test/test_conversion.cpp b/test/test_conversion.cpp index 153920ac0..47950e80e 100644 --- a/test/test_conversion.cpp +++ b/test/test_conversion.cpp @@ -14,7 +14,7 @@ #include "test_utils.hpp" -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 template struct conversion_test { From 0cb875c89350d4f8c9f58c48b412d15400c0e264 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 21 Jul 2025 08:04:47 +0200 Subject: [PATCH 52/57] WIP --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9e91c2047..6e6b65556 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -140,7 +140,7 @@ set(XSIMD_TESTS test_batch_int.cpp test_bitwise_cast.cpp test_batch_constant.cpp - # test_batch_manip.cpp + test_batch_manip.cpp test_complex_exponential.cpp test_complex_hyperbolic.cpp test_complex_power.cpp From 88b3e0b700ca54083fadbe5e01baadf7f8ff6c8e Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 21 Jul 2025 08:25:23 +0200 Subject: [PATCH 53/57] WIP --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6e6b65556..6302eb07c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -158,7 +158,7 @@ set(XSIMD_TESTS test_poly_evaluation.cpp test_power.cpp test_rounding.cpp - # test_select.cpp + test_select.cpp # test_shuffle.cpp test_sum.cpp test_traits.cpp From 7e6e837cea58aef18e6e690ee68515db9cf356c9 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 21 Jul 2025 08:43:08 +0200 Subject: [PATCH 54/57] WIP --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/test/CMakeLists.txt b/test/CMakeLists.txt index 6302eb07c..23c9345bb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -159,7 +159,7 @@ set(XSIMD_TESTS test_power.cpp test_rounding.cpp test_select.cpp - # test_shuffle.cpp + test_shuffle.cpp test_sum.cpp test_traits.cpp test_trigonometric.cpp From a7f64dc92a58cf99517d7b72a19246fc23eafc7c Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 21 Jul 2025 09:07:42 +0200 Subject: [PATCH 55/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 2 +- test/CMakeLists.txt | 2 +- test/test_shuffle.cpp | 22 +++++++++++++++++++--- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index a6680e2e8..5e441f90e 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -932,7 +932,7 @@ namespace xsimd #endif // zip_hi - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { return vec_mergeh(self.data, other.data); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 23c9345bb..e6bad7999 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -163,7 +163,7 @@ set(XSIMD_TESTS test_sum.cpp test_traits.cpp test_trigonometric.cpp - # test_xsimd_api.cpp + test_xsimd_api.cpp test_utils.hpp ) diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp index bc89aafd5..a87428d00 100644 --- a/test/test_shuffle.cpp +++ b/test/test_shuffle.cpp @@ -94,6 +94,8 @@ struct zip_test : zip_base #if !XSIMD_WITH_AVX512F || XSIMD_WITH_AVX512BW #define ZIP_BATCH_TYPES BATCH_TYPES +#elif XSIMD_WITH_ALTIVEC +#define ZIP_BATCH_TYPES xsimd::batch, xsimd::batch #else #define ZIP_BATCH_TYPES xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch #endif @@ -347,7 +349,13 @@ struct compress_test } }; -TEST_CASE_TEMPLATE("[compress]", B, BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch) +#if XSIMD_WITH_ALTIVEC +#define XSIMD_COMPRESS_TYPES BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch +#else +#define XSIMD_COMPRESS_TYPES BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch +#endif + +TEST_CASE_TEMPLATE("[compress]", B, XSIMD_COMPRESS_TYPES) { compress_test Test; SUBCASE("empty") @@ -443,7 +451,9 @@ struct expand_test } }; -TEST_CASE_TEMPLATE("[expand]", B, BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch) +#define XSIMD_EXPAND_TYPES XSIMD_COMPRESS_TYPES + +TEST_CASE_TEMPLATE("[expand]", B, XSIMD_EXPAND_TYPES) { expand_test Test; SUBCASE("empty") @@ -690,7 +700,13 @@ struct shuffle_test } }; -TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch) +#if XSIMD_WITH_ALTIVEC +#define XSIMD_SHUFFLE_TYPES BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch +#else +#define XSIMD_SHUFFLE_TYPES BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch +#endif + +TEST_CASE_TEMPLATE("[shuffle]", B, XSIMD_SHUFFLE_TYPES) { shuffle_test Test; SUBCASE("no-op") From 6b975ac3ef6e857fbfd0f1af039af2cd264b3a72 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 21 Jul 2025 11:45:12 +0200 Subject: [PATCH 56/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 ++-- 
include/xsimd/types/xsimd_altivec_register.hpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 5e441f90e..41d845cac 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -750,14 +750,14 @@ namespace xsimd template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { - return vec_sll(x.data, vec_splat_u8(N)); + return vec_sll(x.data, vec_splats((uint32_t)N)); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { - return vec_srl(x.data, vec_splat_u8(N)); + return vec_srl(x.data, vec_splats((uint32_t)N)); } // sadd diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp index 2ed7d89ee..36f117122 100644 --- a/include/xsimd/types/xsimd_altivec_register.hpp +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -64,6 +64,8 @@ namespace xsimd XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(short, short); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned int, int); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(int, int); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned long, long); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(long, long); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(float, int); #undef XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER From 51ef120d881ac24608263508dc825f2d38156f5e Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 22 Jul 2025 12:21:40 +0200 Subject: [PATCH 57/57] WIP --- include/xsimd/types/xsimd_altivec_register.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp index 36f117122..4de69ea3c 100644 --- a/include/xsimd/types/xsimd_altivec_register.hpp +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -67,6 +67,7 @@ namespace xsimd XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned long, long); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(long, long); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(float, int); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(double, long); #undef XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER }