From 55cd52c1f9675c557bd1a88755a0604239055aea Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 13:22:29 +0200 Subject: [PATCH 01/57] Tentative support for altivec --- .github/workflows/cross.yml | 1 + CMakeLists.txt | 3 + docs/Doxyfile | 1 + include/xsimd/arch/xsimd_altivec.hpp | 1837 +++++++++++++++++ include/xsimd/arch/xsimd_isa.hpp | 4 + include/xsimd/config/xsimd_config.hpp | 13 +- include/xsimd/types/xsimd_all_registers.hpp | 2 + .../xsimd/types/xsimd_altivec_register.hpp | 57 + 8 files changed, 1917 insertions(+), 1 deletion(-) create mode 100644 include/xsimd/arch/xsimd_altivec.hpp create mode 100644 include/xsimd/types/xsimd_altivec_register.hpp diff --git a/.github/workflows/cross.yml b/.github/workflows/cross.yml index 071e85f25..e71096f02 100644 --- a/.github/workflows/cross.yml +++ b/.github/workflows/cross.yml @@ -13,6 +13,7 @@ jobs: - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=neon', full: 'ON'} - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=vfpv3-d16', full: 'OFF' } # no neon - { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' } + - { platform: 'ppc', arch: 'powerpc', dir: 'powerpc-linux-gnu', flags: '-maltivec', full: 'OFF' } sys: - { compiler: 'gcc', version: '10' } steps: diff --git a/CMakeLists.txt b/CMakeLists.txt index 860a84bad..ea30d6814 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -35,6 +35,7 @@ message(STATUS "xsimd v${${PROJECT_NAME}_VERSION}") set(XSIMD_HEADERS ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_constants.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx.hpp +${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_avx2.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma3_sse.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_fma4.hpp @@ -49,6 +50,7 @@ ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse3.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_1.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_2.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_ssse3.hpp +${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_altivec.hpp ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sve.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_arch.hpp ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp @@ -70,6 +72,7 @@ ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_sse_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma4_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_common_arch.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_register.hpp +${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_altivec_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_rvv_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse2_register.hpp ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse3_register.hpp diff --git a/docs/Doxyfile b/docs/Doxyfile index 390baf223..9de40e8da 100644 --- a/docs/Doxyfile +++ b/docs/Doxyfile @@ -9,6 +9,7 @@ INPUT = ../include/xsimd/types/xsimd_api.hpp \ ../include/xsimd/memory/xsimd_aligned_allocator.hpp \ ../include/xsimd/types/xsimd_common_arch.hpp \ ../include/xsimd/types/xsimd_traits.hpp \ + ../include/xsimd/types/xsimd_altivec_register.hpp \ ../include/xsimd/types/xsimd_avx2_register.hpp \ ../include/xsimd/types/xsimd_avx512bw_register.hpp \ ../include/xsimd/types/xsimd_avx512cd_register.hpp \ diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp new file mode 100644 index 000000000..23206ad5d --- /dev/null +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -0,0 +1,1837 @@ 
+/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_ALTIVEC_HPP +#define XSIMD_ALTIVEC_HPP + +#include +#include +#include + +#include "../types/xsimd_altivec_register.hpp" + +namespace xsimd +{ + template + struct batch_bool_constant; + + template + XSIMD_INLINE batch bitwise_cast(batch const& x) noexcept; + + template + struct batch_constant; + + namespace kernel + { +#if 0 + using namespace types; + + namespace detail + { + constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z) + { + return (z << 6) | (y << 4) | (x << 2) | w; + } + constexpr uint32_t shuffle(uint32_t x, uint32_t y) + { + return (y << 1) | x; + } + + constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z) + { + return shuffle(w % 4, x % 4, y % 4, z % 4); + } + + constexpr uint32_t mod_shuffle(uint32_t w, uint32_t x) + { + return shuffle(w % 2, x % 2); + } + } + + // fwd + template + XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept; + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept; + template + XSIMD_INLINE batch avg(batch const&, batch const&, requires_arch) noexcept; + template + XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; + + // abs + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + __m128d sign_mask = _mm_set1_pd(-0.f); // -0.f = 1 << 31 + return _mm_andnot_pd(sign_mask, self); + } + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 + return _mm_andnot_ps(sign_mask, self); + } + + // add + template ::value, void>::type> + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_add_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_add_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_add_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_add_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + template + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_add_ps(self, other); + } + + template + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_add_pd(self, other); + } + + // all + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_ps(self) == 0x0F; + } + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_pd(self) == 0x03; + } + template ::value, void>::type> + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_epi8(self) == 0xFFFF; + } + + // any + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_ps(self) != 0; + } + template + XSIMD_INLINE bool 
any(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_pd(self) != 0; + } + template ::value, void>::type> + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_epi8(self) != 0; + } + + // avgr + template ::value, void>::type> + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_avg_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_avg_epu16(self, other); + } + else + { + return avgr(self, other, common {}); + } + } + + // avg + template ::value, void>::type> + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, common {}); + } + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return { bitwise_cast(batch(self.data)).data }; + } + + // bitwise_and + template + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_and_ps(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_and_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_and_si128(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_and_si128(self, other); + } + + template + batch XSIMD_INLINE bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_and_pd(self, other); + } + + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_and_pd(self, other); + } + + // bitwise_andnot + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_andnot_ps(other, self); + } + + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_andnot_ps(other, self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_andnot_si128(other, self); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_andnot_si128(other, self); + } + + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_andnot_pd(other, self); + } + + template + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_andnot_pd(other, self); + } + + // bitwise_lshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return 
_mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_slli_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_slli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_slli_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // bitwise_not + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); + } + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_si128(self, _mm_set1_epi32(-1)); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_si128(self, _mm_set1_epi32(-1)); + } + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); + } + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); + } + + // bitwise_or + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_ps(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_si128(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_si128(self, other); + } + + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_or_pd(self, other); + } + + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_or_pd(self, other); + } + + // bitwise_rshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF); + __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self); + __m128i res = _mm_srai_epi16(self, other); + return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_srai_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_srai_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + // from https://github.com/samyvilar/vect/blob/master/vect_128.h + return _mm_or_si128( + _mm_srli_epi64(self, other), + _mm_slli_epi64( + _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32), + 64 - other)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { 
+ return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_srli_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_srli_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_srli_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + // bitwise_xor + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_si128(self, other); + } + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_si128(self, other); + } + + // bitwise_cast + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castsi128_ps(self); + } + template ::type>::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return batch(self.data); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castps_si128(self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castsi128_pd(self); + } + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castps_pd(self); + } + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castpd_ps(self); + } + template ::value, void>::type> + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_castpd_si128(self); + } + + // broadcast + template + batch XSIMD_INLINE broadcast(float val, requires_arch) noexcept + { + return _mm_set1_ps(val); + } + template ::value, void>::type> + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_set1_epi8(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_set1_epi16(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_set1_epi32(val); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_set1_epi64x(val); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept + { + return _mm_set1_pd(val); + } + + // store_complex + namespace detail + { + // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned + // complex_low + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) 
noexcept + { + return _mm_unpacklo_ps(self.real(), self.imag()); + } + // complex_high + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpackhi_ps(self.real(), self.imag()); + } + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpacklo_pd(self.real(), self.imag()); + } + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return _mm_unpackhi_pd(self.real(), self.imag()); + } + } + + // decr_if + template ::value, void>::type> + XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return self + batch(mask.data); + } + + // div + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_div_ps(self, other); + } + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_div_pd(self, other); + } + + // fast_cast + namespace detail + { + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_cvtepi32_ps(self); + } + + template + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to sse2 + __m128i xH = _mm_srli_epi64(x, 32); + xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84 + __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); + __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 + __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 + return _mm_add_pd(f, _mm_castsi128_pd(xL)); + } + + template + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + { + // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx + // adapted to sse2 + __m128i xH = _mm_srai_epi32(x, 16); + xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); + xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67 + __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); + __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 + __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 + return _mm_add_pd(f, _mm_castsi128_pd(xL)); + } + + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return _mm_cvttps_epi32(self); + } + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpeq_ps(self, other); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other))); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmpeq_epi8(self, other); 
+ } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmpeq_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmpeq_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m128i tmp1 = _mm_cmpeq_epi32(self, other); + __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1); + __m128i tmp3 = _mm_and_si128(tmp1, tmp2); + __m128i tmp4 = _mm_srai_epi32(tmp3, 31); + return _mm_shuffle_epi32(tmp4, 0xF5); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~(self != other); + } + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpeq_pd(self, other); + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); + } + + // first + template + XSIMD_INLINE float first(batch const& self, requires_arch) noexcept + { + return _mm_cvtss_f32(self); + } + + template + XSIMD_INLINE double first(batch const& self, requires_arch) noexcept + { + return _mm_cvtsd_f64(self); + } + + template ::value, void>::type> + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return static_cast(_mm_cvtsi128_si32(self) & 0xFF); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return static_cast(_mm_cvtsi128_si32(self) & 0xFFFF); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return static_cast(_mm_cvtsi128_si32(self)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { +#if defined(__x86_64__) + return static_cast(_mm_cvtsi128_si64(self)); +#else + __m128i m; + _mm_storel_epi64(&m, self); + int64_t i; + std::memcpy(&i, &m, sizeof(i)); + return i; +#endif + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + + // from_mask + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint32_t lut[][4] = { + { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 }, + { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF }, + { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF }, + }; + assert(!(mask & ~0xFul) && "inbound mask"); + return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask])); + } + template + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut[][4] = { + { 0x0000000000000000ul, 0x0000000000000000ul }, + { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul }, + { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul }, + 
{ 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul }, + }; + assert(!(mask & ~0x3ul) && "inbound mask"); + return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask])); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + { + alignas(A::alignment()) static const uint64_t lut64[] = { + 0x0000000000000000, + 0x000000000000FFFF, + 0x00000000FFFF0000, + 0x00000000FFFFFFFF, + 0x0000FFFF00000000, + 0x0000FFFF0000FFFF, + 0x0000FFFFFFFF0000, + 0x0000FFFFFFFFFFFF, + 0xFFFF000000000000, + 0xFFFF00000000FFFF, + 0xFFFF0000FFFF0000, + 0xFFFF0000FFFFFFFF, + 0xFFFFFFFF00000000, + 0xFFFFFFFF0000FFFF, + 0xFFFFFFFFFFFF0000, + 0xFFFFFFFFFFFFFFFF, + }; + alignas(A::alignment()) static const uint32_t lut32[] = { + 0x00000000, + 0x000000FF, + 0x0000FF00, + 0x0000FFFF, + 0x00FF0000, + 0x00FF00FF, + 0x00FFFF00, + 0x00FFFFFF, + 0xFF000000, + 0xFF0000FF, + 0xFF00FF00, + 0xFF00FFFF, + 0xFFFF0000, + 0xFFFF00FF, + 0xFFFFFF00, + 0xFFFFFFFF, + }; + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + assert(!(mask & ~0xFFFF) && "inbound mask"); + return _mm_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + assert(!(mask & ~0xFF) && "inbound mask"); + return _mm_set_epi64x(lut64[mask >> 4], lut64[mask & 0xF]); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_castps_si128(from_mask(batch_bool {}, mask, sse2 {})); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_castpd_si128(from_mask(batch_bool {}, mask, sse2 {})); + } + } + + // ge + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpge_ps(self, other); + } + template + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpge_pd(self, other); + } + + // gt + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpgt_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmpgt_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmpgt_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmpgt_epi32(self, other); + } + else + { + return gt(self, other, common {}); + } + } + else + { + return gt(self, other, common {}); + } + } + + template + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpgt_pd(self, other); + } + + // haddp + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); + __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); + __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]); + tmp0 = _mm_add_ps(tmp0, tmp1); + tmp1 = _mm_unpacklo_ps(row[2], row[3]); + tmp1 = _mm_add_ps(tmp1, tmp2); + tmp2 = _mm_movehl_ps(tmp1, tmp0); + tmp0 = _mm_movelh_ps(tmp0, tmp1); + return _mm_add_ps(tmp0, tmp2); + } + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), + _mm_unpackhi_pd(row[0], row[1])); + } + + // incr_if + template ::value, void>::type> + XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return 
self - batch(mask.data); + } + + // insert + template ::value, void>::type> + XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_insert_epi16(self, val, I); + } + else + { + return insert(self, val, pos, common {}); + } + } + + // isnan + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm_cmpunord_ps(self, self); + } + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return _mm_cmpunord_pd(self, self); + } + + // load_aligned + template + XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept + { + return _mm_load_ps(mem); + } + template ::value, void>::type> + XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return _mm_load_si128((__m128i const*)mem); + } + template + XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept + { + return _mm_load_pd(mem); + } + + // load_unaligned + template + XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_ps(mem); + } + template ::value, void>::type> + XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_si128((__m128i const*)mem); + } + template + XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept + { + return _mm_loadu_pd(mem); + } + + // load_complex + namespace detail + { + // Redefine these methods in the SSE-based archs if required + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) }; + } + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; + } + } + + // le + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmple_ps(self, other); + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmple_pd(self, other); + } + + // lt + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmplt_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmplt_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmplt_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmplt_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m128i tmp1 = _mm_sub_epi64(self, other); + __m128i tmp2 = _mm_xor_si128(self, other); + __m128i tmp3 = _mm_andnot_si128(other, self); + __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); + __m128i tmp5 = _mm_or_si128(tmp3, tmp4); + __m128i tmp6 = _mm_srai_epi32(tmp5, 31); + return _mm_shuffle_epi32(tmp6, 0xF5); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits::lowest())), _mm_xor_si128(other, 
_mm_set1_epi8(std::numeric_limits::lowest()))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits::lowest()))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits::lowest()))); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits::lowest())); + auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits::lowest())); + __m128i tmp1 = _mm_sub_epi64(xself, xother); + __m128i tmp2 = _mm_xor_si128(xself, xother); + __m128i tmp3 = _mm_andnot_si128(xother, xself); + __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); + __m128i tmp5 = _mm_or_si128(tmp3, tmp4); + __m128i tmp6 = _mm_srai_epi32(tmp5, 31); + return _mm_shuffle_epi32(tmp6, 0xF5); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + } + + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmplt_pd(self, other); + } + + /* compression table to turn 0b10 into 0b1, + * 0b100010 into 0b101 etc + */ + namespace detail + { + XSIMD_INLINE int mask_lut(uint64_t mask) + { + // clang-format off + static const int mask_lut[256] = { + 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x4, 0x0, 0x5, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6, 0x0, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x8, 0x0, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0xA, 0x0, 0xB, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0xC, 0x0, 0xD, 0x0, 0x0, 0x0, 0x0, 0x0, 0xE, 0x0, 0xF, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, + }; + // clang-format on + return mask_lut[mask & 0xAA]; + } + } + + // mask + template ::value, void>::type> + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_movemask_epi8(self); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + uint64_t mask8 = _mm_movemask_epi8(self); + return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_movemask_ps(_mm_castsi128_ps(self)); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_movemask_pd(_mm_castsi128_pd(self)); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE uint64_t mask(batch_bool 
const& self, requires_arch) noexcept + { + return _mm_movemask_ps(self); + } + + template + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + { + return _mm_movemask_pd(self); + } + + // max + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self > other, self, other); + } + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_max_pd(self, other); + } + + // min + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return select(self <= other, self, other); + } + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_min_pd(self, other); + } + + // mul + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mul_ps(self, other); + } + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mul_pd(self, other); + } + + // mul + template + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_mullo_epi16(self, other); + } + + // nearbyint_as_int + template + XSIMD_INLINE batch nearbyint_as_int(batch const& self, + requires_arch) noexcept + { + return _mm_cvtps_epi32(self); + } + + // neg + template ::value, void>::type> + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return 0 - self; + } + template + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); + } + + template + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return _mm_xor_pd( + self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpneq_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self == other); + } + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data))); + } + + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_cmpneq_pd(self, other); + } + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return _mm_xor_pd(self, other); + } + + // reciprocal + template + XSIMD_INLINE batch reciprocal(batch const& self, + kernel::requires_arch) + { + return _mm_rcp_ps(self); + } + + // reduce_add + template + XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept + { + __m128 tmp0 = _mm_add_ps(self, 
_mm_movehl_ps(self, self)); + __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); + return _mm_cvtss_f32(tmp1); + } + + template ::value, void>::type> + XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); + __m128i tmp2 = _mm_add_epi32(self, tmp1); + __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01); + __m128i tmp4 = _mm_add_epi32(tmp2, tmp3); + return _mm_cvtsi128_si32(tmp4); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); + __m128i tmp2 = _mm_add_epi64(self, tmp1); +#if defined(__x86_64__) + return _mm_cvtsi128_si64(tmp2); +#else + __m128i m; + _mm_storel_epi64(&m, tmp2); + int64_t i; + std::memcpy(&i, &m, sizeof(i)); + return i; +#endif + } + else + { + return hadd(self, common {}); + } + } + + template + XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept + { + return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); + } + + // reduce_max + template ::type> + XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept + { + constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); + batch step0 = _mm_shuffle_epi32(self, mask0); + batch acc0 = max(self, step0); + + constexpr auto mask1 = detail::shuffle(1, 0, 0, 0); + batch step1 = _mm_shuffle_epi32(acc0, mask1); + batch acc1 = max(acc0, step1); + + constexpr auto mask2 = detail::shuffle(1, 0, 0, 0); + batch step2 = _mm_shufflelo_epi16(acc1, mask2); + batch acc2 = max(acc1, step2); + if (sizeof(T) == 2) + return first(acc2, A {}); + batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); + batch acc3 = max(acc2, step3); + return first(acc3, A {}); + } + + // reduce_min + template ::type> + XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept + { + constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); + batch step0 = _mm_shuffle_epi32(self, mask0); + batch acc0 = min(self, step0); + + constexpr auto mask1 = detail::shuffle(1, 0, 0, 0); + batch step1 = _mm_shuffle_epi32(acc0, mask1); + batch acc1 = min(acc0, step1); + + constexpr auto mask2 = detail::shuffle(1, 0, 0, 0); + batch step2 = _mm_shufflelo_epi16(acc1, mask2); + batch acc2 = min(acc1, step2); + if (sizeof(T) == 2) + return first(acc2, A {}); + batch step3 = bitwise_cast(bitwise_cast(acc2) >> 8); + batch acc3 = min(acc2, step3); + return first(acc3, A {}); + } + + // rsqrt + template + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm_rsqrt_ps(val); + } + template + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val))); + } + + // select + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); + } + + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); + } + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... 
}, true_br, false_br, sse2 {}); + } + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); + } + + // shuffle + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + { + constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); + // shuffle within lane + if (I0 < 4 && I1 < 4 && I2 >= 4 && I3 >= 4) + return _mm_shuffle_ps(x, y, smask); + + // shuffle within opposite lane + if (I0 >= 4 && I1 >= 4 && I2 < 4 && I3 < 4) + return _mm_shuffle_ps(y, x, smask); + return shuffle(x, y, mask, common {}); + } + + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + { + constexpr uint32_t smask = detail::mod_shuffle(I0, I1); + // shuffle within lane + if (I0 < 2 && I1 >= 2) + return _mm_shuffle_pd(x, y, smask); + + // shuffle within opposite lane + if (I0 >= 2 && I1 < 2) + return _mm_shuffle_pd(y, x, smask); + return shuffle(x, y, mask, common {}); + } + + // sqrt + template + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm_sqrt_ps(val); + } + template + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return _mm_sqrt_pd(val); + } + + // slide_left + template + XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept + { + return _mm_slli_si128(x, N); + } + + // slide_right + template + XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept + { + return _mm_srli_si128(x, N); + } + + // sadd + + template ::value, void>::type> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_adds_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_adds_epi16(self, other); + } + else + { + return sadd(self, other, common {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_adds_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_adds_epu16(self, other); + } + else + { + return sadd(self, other, common {}); + } + } + } + + // set + template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm_setr_ps(values...); + } + + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1) noexcept + { + return _mm_set_epi64x(v1, v0); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept + { + return _mm_setr_epi32(v0, v1, v2, v3); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + { + return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); + } + template ::value, void>::type> + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + { + return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + } + + template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... 
values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return _mm_setr_pd(values...); + } + + template ::value, void>::type> + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } + + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm_castsi128_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } + + template + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return _mm_castsi128_pd(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); + } + + // ssub + + template ::value, void>::type> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + if (std::is_signed::value) + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_subs_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_subs_epi16(self, other); + } + else + { + return ssub(self, other, common {}); + } + } + else + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_subs_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_subs_epu16(self, other); + } + else + { + return ssub(self, other, common {}); + } + } + } + + // store_aligned + template + XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm_store_ps(mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm_store_si128((__m128i*)mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm_store_si128((__m128i*)mem, self); + } + template + XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm_store_pd(mem, self); + } + + // store_unaligned + template + XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_ps(mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_si128((__m128i*)mem, self); + } + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept + { + return _mm_storeu_si128((__m128i*)mem, self); + } + template + XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept + { + return _mm_storeu_pd(mem, self); + } + + // sub + template + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_sub_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_sub_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_sub_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_sub_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_sub_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; 
+ } + } + template + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_sub_pd(self, other); + } + + // swizzle + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); + return _mm_shuffle_ps(self, self, index); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(V0, V1); + return _mm_shuffle_pd(self, self, index); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); + return _mm_shuffle_epi32(self, index); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); + return _mm_shuffle_epi32(self, index); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + // permute within each lane + constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3); + constexpr auto mask_hi = detail::mod_shuffle(V4, V5, V6, V7); + __m128i lo = _mm_shufflelo_epi16(self, mask_lo); + __m128i hi = _mm_shufflehi_epi16(self, mask_hi); + + __m128i lo_lo = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(lo), _mm_castsi128_pd(lo), _MM_SHUFFLE2(0, 0))); + __m128i hi_hi = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(hi), _mm_castsi128_pd(hi), _MM_SHUFFLE2(1, 1))); + + // mask to choose the right lane + batch_bool_constant blend_mask; + + // blend the two permutes + return select(blend_mask, batch(lo_lo), batch(hi_hi)); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + } + + // transpose + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + auto r0 = matrix_begin[0], r1 = matrix_begin[1], r2 = matrix_begin[2], r3 = matrix_begin[3]; + _MM_TRANSPOSE4_PS(r0, r1, r2, r3); + matrix_begin[0] = r0; + matrix_begin[1] = r1; + matrix_begin[2] = r2; + matrix_begin[3] = r3; + } + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); + (void)matrix_end; + auto r0 = matrix_begin[0], r1 = matrix_begin[1]; + matrix_begin[0] = _mm_unpacklo_pd(r0, r1); + matrix_begin[1] = _mm_unpackhi_pd(r0, r1); + } + 
template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + template + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + { + transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); + } + + // zip_hi + template + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpackhi_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_unpackhi_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_unpackhi_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_unpackhi_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_unpackhi_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpackhi_pd(self, other); + } + + // zip_lo + template + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpacklo_ps(self, other); + } + template ::value, void>::type> + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_unpacklo_epi8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_unpacklo_epi16(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + return _mm_unpacklo_epi32(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + return _mm_unpacklo_epi64(self, other); + } + else + { + assert(false && "unsupported arch/op combination"); + return {}; + } + } + template + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return _mm_unpacklo_pd(self, other); + } +#endif + } +} + +#endif diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index f88d94f93..0428c8f7c 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -128,6 +128,10 @@ #include "./xsimd_wasm.hpp" #endif +#if XSIMD_WITH_ALTIVEC +#include "./xsimd_altivec.hpp" +#endif + // Must come last to have access to all conversion specializations. #include "./xsimd_common.hpp" diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index 326f766c4..98d603647 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ -408,6 +408,17 @@ #define XSIMD_WITH_WASM 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if Altivec is available at compile-time, to 0 otherwise. 
+ */ +#ifdef __VEC__ +#define XSIMD_WITH_ALTIVEC 1 +#else +#define XSIMD_WITH_ALTIVEC 0 +#endif + // Workaround for MSVC compiler #ifdef _MSC_VER @@ -466,7 +477,7 @@ #endif -#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_EMULATED +#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_ALTIVEC && !XSIMD_WITH_EMULATED #define XSIMD_NO_SUPPORTED_ARCHITECTURE #endif diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp index a652061a8..9d72454dd 100644 --- a/include/xsimd/types/xsimd_all_registers.hpp +++ b/include/xsimd/types/xsimd_all_registers.hpp @@ -48,6 +48,8 @@ #include "xsimd_wasm_register.hpp" +#include "xsimd_altivec_register.hpp" + #if XSIMD_WITH_EMULATED #include "xsimd_emulated_register.hpp" #endif diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp new file mode 100644 index 000000000..0ec59ac17 --- /dev/null +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -0,0 +1,57 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_ALTIVEC_REGISTER_HPP +#define XSIMD_ALTIVEC_REGISTER_HPP + +#include "./xsimd_common_arch.hpp" +#include "./xsimd_register.hpp" + +#if XSIMD_WITH_ALTIVEC +#include +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * Altivec instructions + */ + struct altivec : common + { + static constexpr bool supported() noexcept { return XSIMD_WITH_ALTIVEC; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr char const* name() noexcept { return "altivec"; } + }; + +#if XSIMD_WITH_ALTIVEC + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER(signed char, altivec, vector signed char); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, altivec, vector unsigned char); + XSIMD_DECLARE_SIMD_REGISTER(char, altivec, vecroe char); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, altivec, vector unsigned short); + XSIMD_DECLARE_SIMD_REGISTER(short, altivec, vector short); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, altivec, vector unsigned int); + XSIMD_DECLARE_SIMD_REGISTER(int, altivec, vector int); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, altivec, vector unsigned long); + XSIMD_DECLARE_SIMD_REGISTER(long int, altivec, vector long); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, altivec, vector unsigned long long); + XSIMD_DECLARE_SIMD_REGISTER(long long int, altivec, vector long long); + XSIMD_DECLARE_SIMD_REGISTER(float, altivec, vector float); + } +#endif +} + +#endif From a8da6516c4b1f1d24cfde056123d1385126fd673 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 13:28:47 +0200 Subject: [PATCH 02/57] minimal test --- test/CMakeLists.txt | 72 ++++++++++++++++++++++----------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8a4ce50d5..bf26edcb4 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -127,42 +127,42 @@ endif() set(XSIMD_TESTS main.cpp - test_api.cpp - test_arch.cpp - test_basic_math.cpp - test_batch.cpp - test_batch_bool.cpp - test_batch_cast.cpp - test_batch_complex.cpp - test_batch_float.cpp - test_batch_int.cpp - test_bitwise_cast.cpp - test_batch_constant.cpp - test_batch_manip.cpp - test_complex_exponential.cpp - test_complex_hyperbolic.cpp - test_complex_power.cpp - test_complex_trigonometric.cpp - test_conversion.cpp - test_custom_default_arch.cpp - test_error_gamma.cpp - test_explicit_batch_instantiation.cpp - test_exponential.cpp - test_extract_pair.cpp - test_fp_manipulation.cpp - test_hyperbolic.cpp - test_load_store.cpp - test_memory.cpp - test_poly_evaluation.cpp - test_power.cpp - test_rounding.cpp - test_select.cpp - test_shuffle.cpp - test_sum.cpp - test_traits.cpp - test_trigonometric.cpp - test_xsimd_api.cpp - test_utils.hpp + # test_api.cpp + # test_arch.cpp + # test_basic_math.cpp + # test_batch.cpp + # test_batch_bool.cpp + # test_batch_cast.cpp + # test_batch_complex.cpp + # test_batch_float.cpp + # test_batch_int.cpp + # test_bitwise_cast.cpp + # test_batch_constant.cpp + # test_batch_manip.cpp + # test_complex_exponential.cpp + # test_complex_hyperbolic.cpp + # test_complex_power.cpp + # test_complex_trigonometric.cpp + # test_conversion.cpp + # test_custom_default_arch.cpp + # test_error_gamma.cpp + # test_explicit_batch_instantiation.cpp + # test_exponential.cpp + # 
test_extract_pair.cpp + # test_fp_manipulation.cpp + # test_hyperbolic.cpp + # test_load_store.cpp + # test_memory.cpp + # test_poly_evaluation.cpp + # test_power.cpp + # test_rounding.cpp + # test_select.cpp + # test_shuffle.cpp + # test_sum.cpp + # test_traits.cpp + # test_trigonometric.cpp + # test_xsimd_api.cpp + # test_utils.hpp ) if(NOT MSVC) From 16838236ee83f158ce0f70ff6365e83c3b1600e4 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 13:41:01 +0200 Subject: [PATCH 03/57] add ppc toolchain description --- .../toolchains/gcc-powerpc-linux-gnu.cmake | 5 +++ .../workflows/{cross.yml => cross-arm.yml} | 1 - .github/workflows/cross-ppc.yml | 43 +++++++++++++++++++ .../xsimd/types/xsimd_altivec_register.hpp | 20 ++++----- test/CMakeLists.txt | 4 +- 5 files changed, 59 insertions(+), 14 deletions(-) create mode 100644 .github/toolchains/gcc-powerpc-linux-gnu.cmake rename .github/workflows/{cross.yml => cross-arm.yml} (96%) create mode 100644 .github/workflows/cross-ppc.yml diff --git a/.github/toolchains/gcc-powerpc-linux-gnu.cmake b/.github/toolchains/gcc-powerpc-linux-gnu.cmake new file mode 100644 index 000000000..a318f6412 --- /dev/null +++ b/.github/toolchains/gcc-powerpc-linux-gnu.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_PROCESSOR powerpc) +set(triple powerpc-linux-gnu) + +include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) + diff --git a/.github/workflows/cross.yml b/.github/workflows/cross-arm.yml similarity index 96% rename from .github/workflows/cross.yml rename to .github/workflows/cross-arm.yml index e71096f02..071e85f25 100644 --- a/.github/workflows/cross.yml +++ b/.github/workflows/cross-arm.yml @@ -13,7 +13,6 @@ jobs: - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=neon', full: 'ON'} - { platform: 'arm', arch: 'armv7-a', dir: 'arm-linux-gnueabihf', flags: '-mfpu=vfpv3-d16', full: 'OFF' } # no neon - { platform: 'aarch64', arch: 'armv8-a', dir: 'aarch64-linux-gnu', flags: '', full: 'ON' } - - { platform: 'ppc', arch: 'powerpc', dir: 'powerpc-linux-gnu', flags: '-maltivec', full: 'OFF' } sys: - { compiler: 'gcc', version: '10' } steps: diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml new file mode 100644 index 000000000..f63383a8f --- /dev/null +++ b/.github/workflows/cross-ppc.yml @@ -0,0 +1,43 @@ +name: PowerPC cross-compilation build +on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} + cancel-in-progress: true +jobs: + build: + runs-on: ubuntu-latest + name: '${{ matrix.target.arch }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' + strategy: + matrix: + target: + - { platform: 'ppc', dir: 'powerpc-linux-gnu', flags: '-maltivec', full: 'OFF' } + sys: + - { compiler: 'gcc', version: '10' } + steps: + - name: Setup compiler + if: ${{ matrix.sys.compiler == 'gcc' }} + run: | + sudo apt-get update || exit 1 + sudo apt-get --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib || exit 1 + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir 
}}-g++-${{ matrix.sys.version }} 20 + - name: Setup QEMU + run: | + sudo apt-get --no-install-suggests --no-install-recommends install qemu-user + - name: Setup Ninja + run: | + sudo apt-get install ninja-build + - name: Checkout xsimd + uses: actions/checkout@v3 + - name: Setup + run: | + mkdir _build + cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake + - name: Build + run: cmake --build _build --verbose + - name: Testing xsimd + run: | + qemu-${{ matrix.target.platform }} -cpu 7400 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd + working-directory: ${{ github.workspace }}/_build diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp index 0ec59ac17..52f896bf2 100644 --- a/include/xsimd/types/xsimd_altivec_register.hpp +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -38,18 +38,14 @@ namespace xsimd #if XSIMD_WITH_ALTIVEC namespace types { - XSIMD_DECLARE_SIMD_REGISTER(signed char, altivec, vector signed char); - XSIMD_DECLARE_SIMD_REGISTER(unsigned char, altivec, vector unsigned char); - XSIMD_DECLARE_SIMD_REGISTER(char, altivec, vecroe char); - XSIMD_DECLARE_SIMD_REGISTER(unsigned short, altivec, vector unsigned short); - XSIMD_DECLARE_SIMD_REGISTER(short, altivec, vector short); - XSIMD_DECLARE_SIMD_REGISTER(unsigned int, altivec, vector unsigned int); - XSIMD_DECLARE_SIMD_REGISTER(int, altivec, vector int); - XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, altivec, vector unsigned long); - XSIMD_DECLARE_SIMD_REGISTER(long int, altivec, vector long); - XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, altivec, vector unsigned long long); - XSIMD_DECLARE_SIMD_REGISTER(long long int, altivec, vector long long); - XSIMD_DECLARE_SIMD_REGISTER(float, altivec, vector float); + XSIMD_DECLARE_SIMD_REGISTER(signed char, altivec, __vector signed char); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, altivec, __vector unsigned char); + XSIMD_DECLARE_SIMD_REGISTER(char, altivec, __vector char); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, altivec, __vector unsigned short); + XSIMD_DECLARE_SIMD_REGISTER(short, altivec, __vector short); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, altivec, __vector unsigned int); + XSIMD_DECLARE_SIMD_REGISTER(int, altivec, __vector int); + XSIMD_DECLARE_SIMD_REGISTER(float, altivec, __vector float); } #endif } diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bf26edcb4..bdcab4b80 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -107,6 +107,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=${TARGET_ARCH} -mtune=${TARGET_ARCH}") elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") # Nothing specific + elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc") + # Nothing specific elseif(NOT WIN32 AND NOT EMSCRIPTEN) if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}") @@ -166,7 +168,7 @@ set(XSIMD_TESTS ) if(NOT MSVC) - list(APPEND XSIMD_TESTS test_gnu_source.cpp) + #list(APPEND XSIMD_TESTS test_gnu_source.cpp) endif() add_executable(test_xsimd 
${XSIMD_TESTS} ${XSIMD_HEADERS}) From ddfad22678996fd60c5d2cdc43720951a8415cb4 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 15:26:11 +0200 Subject: [PATCH 04/57] + test_arch --- test/CMakeLists.txt | 74 ++++++++++++++++++++++----------------------- 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bdcab4b80..e6bad7999 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -129,46 +129,46 @@ endif() set(XSIMD_TESTS main.cpp - # test_api.cpp - # test_arch.cpp - # test_basic_math.cpp - # test_batch.cpp - # test_batch_bool.cpp - # test_batch_cast.cpp - # test_batch_complex.cpp - # test_batch_float.cpp - # test_batch_int.cpp - # test_bitwise_cast.cpp - # test_batch_constant.cpp - # test_batch_manip.cpp - # test_complex_exponential.cpp - # test_complex_hyperbolic.cpp - # test_complex_power.cpp - # test_complex_trigonometric.cpp - # test_conversion.cpp - # test_custom_default_arch.cpp - # test_error_gamma.cpp - # test_explicit_batch_instantiation.cpp - # test_exponential.cpp - # test_extract_pair.cpp - # test_fp_manipulation.cpp - # test_hyperbolic.cpp - # test_load_store.cpp - # test_memory.cpp - # test_poly_evaluation.cpp - # test_power.cpp - # test_rounding.cpp - # test_select.cpp - # test_shuffle.cpp - # test_sum.cpp - # test_traits.cpp - # test_trigonometric.cpp - # test_xsimd_api.cpp - # test_utils.hpp + test_api.cpp + test_arch.cpp + test_basic_math.cpp + test_batch.cpp + test_batch_bool.cpp + test_batch_cast.cpp + test_batch_complex.cpp + test_batch_float.cpp + test_batch_int.cpp + test_bitwise_cast.cpp + test_batch_constant.cpp + test_batch_manip.cpp + test_complex_exponential.cpp + test_complex_hyperbolic.cpp + test_complex_power.cpp + test_complex_trigonometric.cpp + test_conversion.cpp + test_custom_default_arch.cpp + test_error_gamma.cpp + test_explicit_batch_instantiation.cpp + test_exponential.cpp + test_extract_pair.cpp + test_fp_manipulation.cpp + test_hyperbolic.cpp + test_load_store.cpp + test_memory.cpp + test_poly_evaluation.cpp + test_power.cpp + test_rounding.cpp + test_select.cpp + test_shuffle.cpp + test_sum.cpp + test_traits.cpp + test_trigonometric.cpp + test_xsimd_api.cpp + test_utils.hpp ) if(NOT MSVC) - #list(APPEND XSIMD_TESTS test_gnu_source.cpp) + list(APPEND XSIMD_TESTS test_gnu_source.cpp) endif() add_executable(test_xsimd ${XSIMD_TESTS} ${XSIMD_HEADERS}) From c586776142ae035a68ae47ca882f608b206b062b Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 15:54:47 +0200 Subject: [PATCH 05/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 211 ++++++--------------------- 1 file changed, 41 insertions(+), 170 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 23206ad5d..bbe87192d 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -65,59 +65,23 @@ namespace xsimd XSIMD_INLINE batch avg(batch const&, batch const&, requires_arch) noexcept; template XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; +#endif // abs template - XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept - { - __m128d sign_mask = _mm_set1_pd(-0.f); // -0.f = 1 << 31 - return _mm_andnot_pd(sign_mask, self); - } - template - XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { - __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 - return _mm_andnot_ps(sign_mask, 
self); + return vec_abs(self); } // add - template ::value, void>::type> - XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_add_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_add_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_add_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_add_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } + return vec_add(self, other); } - template - XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_add_ps(self, other); - } - - template - XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_add_pd(self, other); - } +#if 0 // all template @@ -923,40 +887,22 @@ namespace xsimd return _mm_cmpunord_pd(self, self); } +#endif // load_aligned - template - XSIMD_INLINE batch load_aligned(float const* mem, convert, requires_arch) noexcept - { - return _mm_load_ps(mem); - } - template ::value, void>::type> - XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { - return _mm_load_si128((__m128i const*)mem); - } - template - XSIMD_INLINE batch load_aligned(double const* mem, convert, requires_arch) noexcept - { - return _mm_load_pd(mem); + return vec_ld(0, mem); } // load_unaligned - template - XSIMD_INLINE batch load_unaligned(float const* mem, convert, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { - return _mm_loadu_ps(mem); - } - template ::value, void>::type> - XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept - { - return _mm_loadu_si128((__m128i const*)mem); - } - template - XSIMD_INLINE batch load_unaligned(double const* mem, convert, requires_arch) noexcept - { - return _mm_loadu_pd(mem); + return *(typename batch::register_type)mem; } +#if 0 // load_complex namespace detail { @@ -972,6 +918,8 @@ namespace xsimd return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; } } +#endif +#if 0 // le template @@ -1435,42 +1383,14 @@ namespace xsimd return _mm_srli_si128(x, N); } +#endif // sadd - - template ::value, void>::type> - XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { - if (std::is_signed::value) - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_adds_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_adds_epi16(self, other); - } - else - { - return sadd(self, other, common {}); - } - } - else - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_adds_epu8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_adds_epu16(self, other); - } - else - { - return sadd(self, other, common {}); - } - } + return vec_adds(self, other); } +#if 0 // set template @@ -1527,88 +1447,39 @@ namespace xsimd static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return 
_mm_castsi128_pd(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } +#endif // ssub - template ::value, void>::type> - XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { - if (std::is_signed::value) - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_subs_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_subs_epi16(self, other); - } - else - { - return ssub(self, other, common {}); - } - } - else - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_subs_epu8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_subs_epu16(self, other); - } - else - { - return ssub(self, other, common {}); - } - } + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return vec_subs(self, other); + } + else + { + return ssub(self, other, common {}); + } } + // store_aligned - template - XSIMD_INLINE void store_aligned(float* mem, batch const& self, requires_arch) noexcept - { - return _mm_store_ps(mem, self); - } - template ::value, void>::type> - XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { - return _mm_store_si128((__m128i*)mem, self); - } - template ::value, void>::type> - XSIMD_INLINE void store_aligned(T* mem, batch_bool const& self, requires_arch) noexcept - { - return _mm_store_si128((__m128i*)mem, self); - } - template - XSIMD_INLINE void store_aligned(double* mem, batch const& self, requires_arch) noexcept - { - return _mm_store_pd(mem, self); + return vec_st(self, 0, mem); } // store_unaligned - template - XSIMD_INLINE void store_unaligned(float* mem, batch const& self, requires_arch) noexcept - { - return _mm_storeu_ps(mem, self); - } - template ::value, void>::type> - XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { - return _mm_storeu_si128((__m128i*)mem, self); - } - template ::value, void>::type> - XSIMD_INLINE void store_unaligned(T* mem, batch_bool const& self, requires_arch) noexcept - { - return _mm_storeu_si128((__m128i*)mem, self); - } - template - XSIMD_INLINE void store_unaligned(double* mem, batch const& self, requires_arch) noexcept - { - return _mm_storeu_pd(mem, self); + *(typename batch::register_type)mem = self; } +#if 0 // sub template XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept From 9c0bd33a365fd75366f089f37bba1390673c9015 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 16:16:51 +0200 Subject: [PATCH 06/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 79 +++++++--------------------- include/xsimd/config/xsimd_arch.hpp | 4 +- include/xsimd/config/xsimd_cpuid.hpp | 5 ++ 3 files changed, 27 insertions(+), 61 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index bbe87192d..059eaa967 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -889,7 +889,7 @@ namespace xsimd #endif // load_aligned - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { return vec_ld(0, mem); @@ -1115,25 +1115,15 @@ 
namespace xsimd { return _mm_min_pd(self, other); } +#endif // mul - template - XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_mul_ps(self, other); - } - template - XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_mul_pd(self, other); - } - - // mul - template - XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_mullo_epi16(self, other); + return vec_mul(self, other); } +#if 0 // nearbyint_as_int template @@ -1388,7 +1378,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { - return vec_adds(self, other); + return vec_adds(self, other); } #if 0 @@ -1454,17 +1444,16 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return vec_subs(self, other); - } - else - { - return ssub(self, other, common {}); - } + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return vec_subs(self, other); + } + else + { + return ssub(self, other, common {}); + } } - // store_aligned template ::value, void>::type> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept @@ -1479,44 +1468,14 @@ namespace xsimd *(typename batch::register_type)mem = self; } -#if 0 // sub - template - XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_sub_ps(self, other); - } - template ::value, void>::type> - XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_sub_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_sub_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_sub_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_sub_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - template - XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_sub_pd(self, other); + return vec_sub(self, other); } +#if 0 // swizzle template diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp index 89fc6783d..99a133ef0 100644 --- a/include/xsimd/config/xsimd_arch.hpp +++ b/include/xsimd/config/xsimd_arch.hpp @@ -169,14 +169,16 @@ namespace xsimd using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; using all_rvv_architectures = arch_list, detail::rvv<256>, detail::rvv<128>>; using all_arm_architectures = typename detail::join, neon64, neon>>::type; + using all_power_architectures = arch_list; using all_riscv_architectures = all_rvv_architectures; using all_wasm_architectures = arch_list; - using all_architectures = typename detail::join::type; + using all_architectures = typename detail::join::type; using supported_architectures = typename detail::supported::type; using x86_arch = typename detail::supported::type::best; using arm_arch = typename detail::supported::type::best; + using power_arch = typename 
detail::supported::type::best; using riscv_arch = typename detail::supported::type::best; using best_arch = typename supported_architectures::best; diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index 7b940f655..fdd044f3d 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -84,6 +84,7 @@ namespace xsimd ARCH_FIELD_EX_REUSE(detail::rvv<256>, rvv) ARCH_FIELD_EX_REUSE(detail::rvv<128>, rvv) ARCH_FIELD(wasm) + ARCH_FIELD(altivec) #undef ARCH_FIELD @@ -95,6 +96,10 @@ namespace xsimd wasm = 1; #endif +#if XSIMD_WITH_ALTIVEC + altivec = 1; +#endif + #if defined(__aarch64__) || defined(_M_ARM64) neon = 1; neon64 = 1; From ad8177bd4dbfe4995009840498184b2f517f73ec Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 16:47:39 +0200 Subject: [PATCH 07/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 400 +++++++++++---------------- 1 file changed, 165 insertions(+), 235 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 059eaa967..997fb5944 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -85,41 +85,41 @@ namespace xsimd // all template - XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_ps(self) == 0x0F; } template - XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_pd(self) == 0x03; } template ::value, void>::type> - XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_epi8(self) == 0xFFFF; } // any template - XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_ps(self) != 0; } template - XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_pd(self) != 0; } template ::value, void>::type> - XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_epi8(self) != 0; } // avgr template ::value, void>::type> - XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -137,7 +137,7 @@ namespace xsimd // avg template ::value, void>::type> - XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -154,86 +154,43 @@ namespace xsimd return avg(self, other, common {}); } } +#endif // batch_bool_cast template - XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { return { bitwise_cast(batch(self.data)).data }; } // bitwise_and - template - XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_and_ps(self, other); - } - 
template - XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_and_ps(self, other); - } - template ::value, void>::type> - XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_and_si128(self, other); - } - template ::value, void>::type> - XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_and_si128(self, other); - } - - template - batch XSIMD_INLINE bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_and_pd(self, other); + return vec_and(self, other); } - - template - XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return _mm_and_pd(self, other); + return vec_and(self, other); } // bitwise_andnot - template - XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_andnot_ps(other, self); - } - - template - XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_andnot_ps(other, self); - } template ::value, void>::type> - XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_andnot_si128(other, self); + return vec_andc(other, self); } template ::value, void>::type> - XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_andnot_si128(other, self); - } - - template - XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return _mm_andnot_pd(other, self); - } - - template - XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_andnot_pd(other, self); + return vec_andc(other, self); } +#if 0 // bitwise_lshift template ::value, void>::type> - XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -260,73 +217,73 @@ namespace xsimd // bitwise_not template - XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); } template - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); } template ::value, void>::type> - XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm_xor_si128(self, _mm_set1_epi32(-1)); } template ::value, 
void>::type> - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm_xor_si128(self, _mm_set1_epi32(-1)); } template - XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); } template - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); } // bitwise_or template - XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm_or_ps(self, other); } template - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_or_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm_or_si128(self, other); } template ::value, void>::type> - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_or_si128(self, other); } template - XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { return _mm_or_pd(self, other); } template - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_or_pd(self, other); } // bitwise_rshift template ::value, void>::type> - XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { if (std::is_signed::value) { @@ -388,81 +345,54 @@ namespace xsimd // bitwise_xor template - XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm_xor_ps(self, other); } template - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { return _mm_xor_si128(self, other); } template - XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) 
noexcept { return _mm_xor_pd(self, other); } template - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_pd(self, other); } template ::value, void>::type> - XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_si128(self, other); } +#endif // bitwise_cast - template ::value, void>::type> - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castsi128_ps(self); - } - template ::type>::value, void>::type> - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return batch(self.data); - } - template ::value, void>::type> - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept { - return _mm_castps_si128(self); - } - template ::value, void>::type> - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castsi128_pd(self); - } - template - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castps_pd(self); - } - template - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castpd_ps(self); - } - template ::value, void>::type> - XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept - { - return _mm_castpd_si128(self); + return *reinterpret_cast::register_type const*>(&self.data); } +#if 0 + // broadcast template - batch XSIMD_INLINE broadcast(float val, requires_arch) noexcept + batch XSIMD_INLINE broadcast(float val, requires_arch) noexcept { return _mm_set1_ps(val); } template ::value, void>::type> - XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -487,7 +417,7 @@ namespace xsimd } } template - XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept + XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept { return _mm_set1_pd(val); } @@ -498,23 +428,23 @@ namespace xsimd // Override these methods in SSE-based archs, no need to override store_aligned / store_unaligned // complex_low template - XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return _mm_unpacklo_ps(self.real(), self.imag()); } // complex_high template - XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return _mm_unpackhi_ps(self.real(), self.imag()); } template - XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { return _mm_unpacklo_pd(self.real(), self.imag()); } template - XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { return _mm_unpackhi_pd(self.real(), self.imag()); } @@ 
-522,19 +452,19 @@ namespace xsimd // decr_if template ::value, void>::type> - XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self + batch(mask.data); } // div template - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm_div_ps(self, other); } template - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return _mm_div_pd(self, other); } @@ -543,16 +473,16 @@ namespace xsimd namespace detail { template - XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_cvtepi32_ps(self); } template - XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx - // adapted to sse2 + // adapted to altivec __m128i xH = _mm_srli_epi64(x, 32); xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84 __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); @@ -562,10 +492,10 @@ namespace xsimd } template - XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept + XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept { // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx - // adapted to sse2 + // adapted to altivec __m128i xH = _mm_srai_epi32(x, 16); xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67 @@ -576,7 +506,7 @@ namespace xsimd } template - XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { return _mm_cvttps_epi32(self); } @@ -584,17 +514,17 @@ namespace xsimd // eq template - XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpeq_ps(self, other); } template - XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other))); } template ::value, void>::type> - XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -623,36 +553,36 @@ namespace xsimd } } template ::value, void>::type> - XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& 
other, requires_arch) noexcept { return ~(self != other); } template - XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpeq_pd(self, other); } template - XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); } // first template - XSIMD_INLINE float first(batch const& self, requires_arch) noexcept + XSIMD_INLINE float first(batch const& self, requires_arch) noexcept { return _mm_cvtss_f32(self); } template - XSIMD_INLINE double first(batch const& self, requires_arch) noexcept + XSIMD_INLINE double first(batch const& self, requires_arch) noexcept { return _mm_cvtsd_f64(self); } template ::value, void>::type> - XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -687,7 +617,7 @@ namespace xsimd // from_mask template - XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint32_t lut[][4] = { { 0x00000000, 0x00000000, 0x00000000, 0x00000000 }, @@ -711,7 +641,7 @@ namespace xsimd return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask])); } template - XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut[][4] = { { 0x0000000000000000ul, 0x0000000000000000ul }, @@ -723,7 +653,7 @@ namespace xsimd return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask])); } template ::value, void>::type> - XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept + XSIMD_INLINE batch_bool from_mask(batch_bool const&, uint64_t mask, requires_arch) noexcept { alignas(A::alignment()) static const uint64_t lut64[] = { 0x0000000000000000, @@ -773,34 +703,34 @@ namespace xsimd } else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - return _mm_castps_si128(from_mask(batch_bool {}, mask, sse2 {})); + return _mm_castps_si128(from_mask(batch_bool {}, mask, altivec {})); } else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) { - return _mm_castpd_si128(from_mask(batch_bool {}, mask, sse2 {})); + return _mm_castpd_si128(from_mask(batch_bool {}, mask, altivec {})); } } // ge template - XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpge_ps(self, other); } template - XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpge_pd(self, other); } // gt template - XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpgt_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch_bool gt(batch const& self, batch const& 
other, requires_arch) noexcept + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { @@ -828,14 +758,14 @@ namespace xsimd } template - XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpgt_pd(self, other); } // haddp template - XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); @@ -848,7 +778,7 @@ namespace xsimd return _mm_add_ps(tmp0, tmp2); } template - XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), _mm_unpackhi_pd(row[0], row[1])); @@ -856,14 +786,14 @@ namespace xsimd // incr_if template ::value, void>::type> - XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept { return self - batch(mask.data); } // insert template ::value, void>::type> - XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept + XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { @@ -877,12 +807,12 @@ namespace xsimd // isnan template - XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm_cmpunord_ps(self, self); } template - XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { return _mm_cmpunord_pd(self, self); } @@ -908,12 +838,12 @@ namespace xsimd { // Redefine these methods in the SSE-based archs if required template - XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) }; } template - XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; } @@ -923,24 +853,24 @@ namespace xsimd // le template - XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmple_ps(self, other); } template - XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmple_pd(self, other); } // lt template - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmplt_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, 
requires_arch) noexcept + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { if (std::is_signed::value) { @@ -1007,7 +937,7 @@ namespace xsimd } template - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmplt_pd(self, other); } @@ -1045,7 +975,7 @@ namespace xsimd // mask template ::value, void>::type> - XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -1071,47 +1001,47 @@ namespace xsimd } } template - XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_ps(self); } template - XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept + XSIMD_INLINE uint64_t mask(batch_bool const& self, requires_arch) noexcept { return _mm_movemask_pd(self); } // max template - XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm_max_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return select(self > other, self, other); } template - XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { return _mm_max_pd(self, other); } // min template - XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm_min_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return select(self <= other, self, other); } template - XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { return _mm_min_pd(self, other); } @@ -1128,25 +1058,25 @@ namespace xsimd // nearbyint_as_int template XSIMD_INLINE batch nearbyint_as_int(batch const& self, - requires_arch) noexcept + requires_arch) noexcept { return _mm_cvtps_epi32(self); } // neg template ::value, void>::type> - XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return 0 - self; } template - XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); } template - XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { return _mm_xor_pd( self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); @@ -1154,33 +1084,33 @@ namespace xsimd // neq template - XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE 
batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpneq_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return ~(self == other); } template - XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data))); } template - XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { return _mm_cmpneq_pd(self, other); } template - XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return _mm_xor_pd(self, other); } @@ -1188,14 +1118,14 @@ namespace xsimd // reciprocal template XSIMD_INLINE batch reciprocal(batch const& self, - kernel::requires_arch) + kernel::requires_arch) { return _mm_rcp_ps(self); } // reduce_add template - XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept + XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept { __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self)); __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); @@ -1203,7 +1133,7 @@ namespace xsimd } template ::value, void>::type> - XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept + XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { @@ -1234,14 +1164,14 @@ namespace xsimd } template - XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept + XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept { return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); } // reduce_max template ::type> - XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept + XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept { constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); batch step0 = _mm_shuffle_epi32(self, mask0); @@ -1263,7 +1193,7 @@ namespace xsimd // reduce_min template ::type> - XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept + XSIMD_INLINE T reduce_min(batch const& self, requires_arch) noexcept { constexpr auto mask0 = detail::shuffle(2, 3, 0, 0); batch step0 = _mm_shuffle_epi32(self, mask0); @@ -1285,42 +1215,42 @@ namespace xsimd // rsqrt template - XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm_rsqrt_ps(val); } template - XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val))); } // select template - XSIMD_INLINE batch select(batch_bool const& cond, batch const& 
true_br, batch const& false_br, requires_arch) noexcept + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); } template ::value, void>::type> - XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); } template ::value, void>::type> - XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return select(batch_bool { Values... }, true_br, false_br, sse2 {}); + return select(batch_bool { Values... }, true_br, false_br, altivec {}); } template - XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); } // shuffle template - XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); // shuffle within lane @@ -1334,7 +1264,7 @@ namespace xsimd } template - XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant mask, requires_arch) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1); // shuffle within lane @@ -1349,26 +1279,26 @@ namespace xsimd // sqrt template - XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm_sqrt_ps(val); } template - XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { return _mm_sqrt_pd(val); } // slide_left template - XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept + XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { return _mm_slli_si128(x, N); } // slide_right template - XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept + XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { return _mm_srli_si128(x, N); } @@ -1384,55 +1314,55 @@ namespace xsimd // set template - XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, Values... 
values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm_setr_ps(values...); } template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1) noexcept { return _mm_set_epi64x(v1, v0); } template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept { return _mm_setr_epi32(v0, v1, v2, v3); } template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept { return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); } template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept { return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); } template - XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return _mm_setr_pd(values...); } template ::value, void>::type> - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; } template - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm_castsi128_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); } template - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); return _mm_castsi128_pd(set(batch(), A {}, static_cast(values ? 
-1LL : 0LL)...).data); @@ -1479,47 +1409,47 @@ namespace xsimd // swizzle template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); return _mm_shuffle_ps(self, self, index); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1); return _mm_shuffle_pd(self, self, index); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); return _mm_shuffle_epi32(self, index); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { - return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + return bitwise_cast(swizzle(bitwise_cast(self), mask, altivec {})); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); return _mm_shuffle_epi32(self, index); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { - return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + return bitwise_cast(swizzle(bitwise_cast(self), mask, altivec {})); } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept { // permute within each lane constexpr auto mask_lo = detail::mod_shuffle(V0, V1, V2, V3); @@ -1538,14 +1468,14 @@ namespace xsimd } template - XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept { - return bitwise_cast(swizzle(bitwise_cast(self), mask, sse2 {})); + return bitwise_cast(swizzle(bitwise_cast(self), mask, altivec {})); } // transpose template - XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; @@ -1557,18 +1487,18 @@ namespace xsimd matrix_begin[3] = r3; } template - XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template - XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template - XSIMD_INLINE void 
transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { assert((matrix_end - matrix_begin == batch::size) && "correctly sized matrix"); (void)matrix_end; @@ -1577,24 +1507,24 @@ namespace xsimd matrix_begin[1] = _mm_unpackhi_pd(r0, r1); } template - XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } template - XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept + XSIMD_INLINE void transpose(batch* matrix_begin, batch* matrix_end, requires_arch) noexcept { transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } // zip_hi template - XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpackhi_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -1619,19 +1549,19 @@ namespace xsimd } } template - XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpackhi_pd(self, other); } // zip_lo template - XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpacklo_ps(self, other); } template ::value, void>::type> - XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { @@ -1656,7 +1586,7 @@ namespace xsimd } } template - XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { return _mm_unpacklo_pd(self, other); } From f2246a447d50051003a1452354558b6567798143 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 17:51:37 +0200 Subject: [PATCH 08/57] WIP --- .github/workflows/cross-ppc.yml | 2 +- include/xsimd/arch/xsimd_altivec.hpp | 46 +++------------------------- 2 files changed, 5 insertions(+), 43 deletions(-) diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml index f63383a8f..ec7a11f04 100644 --- a/.github/workflows/cross-ppc.yml +++ b/.github/workflows/cross-ppc.yml @@ -36,7 +36,7 @@ jobs: mkdir _build cd _build && cmake .. 
-DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake - name: Build - run: cmake --build _build --verbose + run: cmake --build _build --verbose -j1 - name: Testing xsimd run: | qemu-${{ matrix.target.platform }} -cpu 7400 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 997fb5944..ce6c2a30f 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -567,53 +567,15 @@ namespace xsimd { return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); } +#endif // first - template - XSIMD_INLINE float first(batch const& self, requires_arch) noexcept - { - return _mm_cvtss_f32(self); - } - - template - XSIMD_INLINE double first(batch const& self, requires_arch) noexcept - { - return _mm_cvtsd_f64(self); - } - - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return static_cast(_mm_cvtsi128_si32(self) & 0xFF); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return static_cast(_mm_cvtsi128_si32(self) & 0xFFFF); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return static_cast(_mm_cvtsi128_si32(self)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { -#if defined(__x86_64__) - return static_cast(_mm_cvtsi128_si64(self)); -#else - __m128i m; - _mm_storel_epi64(&m, self); - int64_t i; - std::memcpy(&i, &m, sizeof(i)); - return i; -#endif - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } + return vec_extract(self, 0); } +#if 0 // from_mask template From f90e5e81d610e5c897d1872e21637fb447af80ab Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 17:55:33 +0200 Subject: [PATCH 09/57] WIP --- test/test_utils.hpp | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/test/test_utils.hpp b/test/test_utils.hpp index f295a8be6..dff6ede09 100644 --- a/test/test_utils.hpp +++ b/test/test_utils.hpp @@ -571,6 +571,14 @@ namespace xsimd * Testing types lists * ***********************/ +#ifdef XSIMD_WITH_ALTIVEC +#define BATCH_INT_TYPES xsimd::batch, \ + xsimd::batch, \ + xsimd::batch, \ + xsimd::batch, \ + xsimd::batch, \ + xsimd::batch +#else #define BATCH_INT_TYPES xsimd::batch, \ xsimd::batch, \ xsimd::batch, \ @@ -579,13 +587,14 @@ namespace xsimd xsimd::batch, \ xsimd::batch, \ xsimd::batch +#endif -#if XSIMD_WITH_NEON64 || !XSIMD_WITH_NEON +#if XSIMD_WITH_NEON64 || (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) #define BATCH_FLOAT_TYPES xsimd::batch, xsimd::batch #else #define BATCH_FLOAT_TYPES xsimd::batch #endif -#if XSIMD_WITH_NEON64 || !XSIMD_WITH_NEON +#if XSIMD_WITH_NEON64 || (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) #define BATCH_COMPLEX_TYPES xsimd::batch>, xsimd::batch> #else #define BATCH_COMPLEX_TYPES xsimd::batch> From 6e5978ca7720d93c620f18070da8e70d9dfa54db Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 18:08:22 +0200 Subject: [PATCH 10/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 84 +++++----------------------- 1 file changed, 14 insertions(+), 70 deletions(-) diff --git 
a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index ce6c2a30f..b5809b7af 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -179,12 +179,12 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { - return vec_andc(other, self); + return vec_nand(self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_andc(other, self); + return vec_nand(self, other); } #if 0 @@ -343,38 +343,18 @@ namespace xsimd } } +#endif // bitwise_xor - template - XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_xor_ps(self, other); - } - template - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_xor_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_xor_si128(self, other); - } - template - XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_xor_pd(self, other); + return vec_xor(self, other); } - template - XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_xor_pd(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return _mm_xor_si128(self, other); + return vec_xor(self, other); } -#endif // bitwise_cast template @@ -383,44 +363,13 @@ namespace xsimd return *reinterpret_cast::register_type const*>(&self.data); } -#if 0 - // broadcast - template - batch XSIMD_INLINE broadcast(float val, requires_arch) noexcept - { - return _mm_set1_ps(val); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_set1_epi8(val); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_set1_epi16(val); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_set1_epi32(val); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_set1_epi64x(val); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - template - XSIMD_INLINE batch broadcast(double val, requires_arch) noexcept - { - return _mm_set1_pd(val); + return vec_splats(val); } +#if 0 // store_complex namespace detail @@ -573,7 +522,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { - return vec_extract(self, 0); + return vec_extract(self, 0); } #if 0 @@ -752,20 +701,15 @@ namespace xsimd { return self - batch(mask.data); } +#endif // insert template ::value, void>::type> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_insert_epi16(self, val, I); - } - else - { - return insert(self, val, pos, common {}); - } + return vec_insert(val, self, pos); } +#if 0 // isnan template From 153c0580f192347573402f57ec924a67800f4a59 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 18:46:56 +0200 Subject: 
[PATCH 11/57] set --- include/xsimd/arch/xsimd_altivec.hpp | 58 ++-------------------------- 1 file changed, 4 insertions(+), 54 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index b5809b7af..b0e53d5c1 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -1216,65 +1216,15 @@ namespace xsimd { return vec_adds(self, other); } -#if 0 // set - template - XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept - { - static_assert(sizeof...(Values) == batch::size, "consistent init"); - return _mm_setr_ps(values...); - } - - template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1) noexcept - { - return _mm_set_epi64x(v1, v0); - } - template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3) noexcept - { - return _mm_setr_epi32(v0, v1, v2, v3); - } - template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept - { - return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7); - } - template ::value, void>::type> - XSIMD_INLINE batch set(batch const&, requires_arch, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept + template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { - return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15); + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return typename batch::register_type { values... }; } - template - XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept - { - static_assert(sizeof...(Values) == batch::size, "consistent init"); - return _mm_setr_pd(values...); - } - - template ::value, void>::type> - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept - { - return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; - } - - template - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept - { - static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); - return _mm_castsi128_ps(set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data); - } - - template - XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept - { - static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); - return _mm_castsi128_pd(set(batch(), A {}, static_cast(values ? 
-1LL : 0LL)...).data); - } -#endif - // ssub template ::value, void>::type> From ae4f6fe0b20d720afc9c604f26db36ace535adad Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 18:57:49 +0200 Subject: [PATCH 12/57] eq --- include/xsimd/arch/xsimd_altivec.hpp | 116 ++++----------------------- 1 file changed, 16 insertions(+), 100 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index b0e53d5c1..c77f6449e 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -81,42 +81,21 @@ namespace xsimd return vec_add(self, other); } -#if 0 - // all - template - XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept - { - return _mm_movemask_ps(self) == 0x0F; - } - template - XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept - { - return _mm_movemask_pd(self) == 0x03; - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { - return _mm_movemask_epi8(self) == 0xFFFF; + return vec_all_ne(self, vec_xor(self, self)); } // any - template - XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept - { - return _mm_movemask_ps(self) != 0; - } - template - XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept - { - return _mm_movemask_pd(self) != 0; - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { - return _mm_movemask_epi8(self) != 0; + return vec_any_ne(self, vec_xor(self, self)); } +#if 0 // avgr template ::value, void>::type> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept @@ -460,63 +439,19 @@ namespace xsimd return _mm_cvttps_epi32(self); } } +#endif // eq - template - XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmpeq_ps(self, other); - } - template - XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other))); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_cmpeq_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_cmpeq_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_cmpeq_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - __m128i tmp1 = _mm_cmpeq_epi32(self, other); - __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1); - __m128i tmp3 = _mm_and_si128(tmp1, tmp2); - __m128i tmp4 = _mm_srai_epi32(tmp3, 31); - return _mm_shuffle_epi32(tmp4, 0xF5); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } + return vec_cmpeq(self, other); } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return ~(self != other); + return vec_cmpeq(self, other); } - template - XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmpeq_pd(self, other); - } - template - XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return 
_mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other))); - } -#endif // first template ::value, void>::type> @@ -987,39 +922,20 @@ namespace xsimd return _mm_xor_pd( self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); } +#endif // neq - template - XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmpneq_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { - return ~(self == other); + return vec_cmpne(self, other); } - template - XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_xor_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data))); - } - - template - XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmpneq_pd(self, other); - } - template - XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_xor_pd(self, other); + return vec_cmpne(self, other); } +#if 0 // reciprocal template From fc7d26f16a8024c5e5b370b78f449a506ac78a3b Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 19:59:01 +0200 Subject: [PATCH 13/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 337 ++++----------------------- 1 file changed, 47 insertions(+), 290 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index c77f6449e..5f07474f3 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -166,163 +166,48 @@ namespace xsimd return vec_nand(self, other); } -#if 0 // bitwise_lshift template ::value, void>::type> XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_slli_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_slli_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_slli_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } + using shift_type = as_unsigned_integer_t; + batch shift(static_cast(other)); + return vec_sl(self, shift); } // bitwise_not - template - XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept - { - return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); - } - template - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept - { - return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1))); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { - return _mm_xor_si128(self, _mm_set1_epi32(-1)); + return vec_nor(self, self); } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { - return _mm_xor_si128(self, _mm_set1_epi32(-1)); - } - template - XSIMD_INLINE batch 
bitwise_not(batch const& self, requires_arch) noexcept - { - return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); - } - template - XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept - { - return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1))); + return vec_nor(self, self); } // bitwise_or - template - XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_or_ps(self, other); - } - template - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_or_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_or_si128(self, other); + return vec_or(self, other); } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return _mm_or_si128(self, other); - } - - template - XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_or_pd(self, other); - } - - template - XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept - { - return _mm_or_pd(self, other); + return vec_or(self, other); } // bitwise_rshift template ::value, void>::type> XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept { - if (std::is_signed::value) - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF); - __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self); - __m128i res = _mm_srai_epi16(self, other); - return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_srai_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_srai_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - // from https://github.com/samyvilar/vect/blob/master/vect_128.h - return _mm_or_si128( - _mm_srli_epi64(self, other), - _mm_slli_epi64( - _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32), - 64 - other)); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - else - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other)); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_srli_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_srli_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_srli_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } + using shift_type = as_unsigned_integer_t; + batch shift(static_cast(other)); + return vec_sr(self, shift); } -#endif // bitwise_xor template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept @@ -556,59 +441,33 @@ namespace xsimd return _mm_castpd_si128(from_mask(batch_bool {}, mask, altivec {})); } } - +#endif // ge - template - XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& 
other, requires_arch) noexcept { - return _mm_cmpge_ps(self, other); + return vec_cmpge(self, other); } template XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_cmpge_pd(self, other); + return vec_cmpge(self, other); } // gt - template - XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmpgt_ps(self, other); - } template ::value, void>::type> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { - if (std::is_signed::value) - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_cmpgt_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_cmpgt_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_cmpgt_epi32(self, other); - } - else - { - return gt(self, other, common {}); - } - } - else - { - return gt(self, other, common {}); - } + return vec_cmpgt(self, other); } - template XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_cmpgt_pd(self, other); + return vec_cmpgt(self, other); } +#if 0 + // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept @@ -644,21 +503,14 @@ namespace xsimd { return vec_insert(val, self, pos); } -#if 0 // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { - return _mm_cmpunord_ps(self, self); - } - template - XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept - { - return _mm_cmpunord_pd(self, self); + return ~vec_cmpeq(self, self); } -#endif // load_aligned template ::value, void>::type> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept @@ -690,99 +542,35 @@ namespace xsimd } } #endif -#if 0 // le - template - XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_cmple_ps(self, other); + return vec_cmple(self, other); } template XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_cmple_pd(self, other); + return vec_cmple(self, other); } // lt - template - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_cmplt_ps(self, other); - } template ::value, void>::type> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { - if (std::is_signed::value) - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_cmplt_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_cmplt_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_cmplt_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - __m128i tmp1 = _mm_sub_epi64(self, other); - __m128i tmp2 = _mm_xor_si128(self, other); - __m128i tmp3 = _mm_andnot_si128(other, self); - __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); - __m128i tmp5 = _mm_or_si128(tmp3, tmp4); - __m128i tmp6 = _mm_srai_epi32(tmp5, 31); - return _mm_shuffle_epi32(tmp6, 0xF5); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - else - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits::lowest()))); - } - else 
XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits::lowest()))); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits::lowest()))); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits::lowest())); - auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits::lowest())); - __m128i tmp1 = _mm_sub_epi64(xself, xother); - __m128i tmp2 = _mm_xor_si128(xself, xother); - __m128i tmp3 = _mm_andnot_si128(xother, xself); - __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1); - __m128i tmp5 = _mm_or_si128(tmp3, tmp4); - __m128i tmp6 = _mm_srai_epi32(tmp5, 31); - return _mm_shuffle_epi32(tmp6, 0xF5); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } + return vec_cmplt(self, other); } - template XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_cmplt_pd(self, other); + return vec_cmplt(self, other); } +#if 0 + + + /* compression table to turn 0b10 into 0b1, * 0b100010 into 0b101 etc */ @@ -853,40 +641,21 @@ namespace xsimd return _mm_movemask_pd(self); } +#endif + // max - template - XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_max_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { - return select(self > other, self, other); - } - template - XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_max_pd(self, other); + return vec_max(self, other); } // min - template - XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_min_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { - return select(self <= other, self, other); + return vec_min(self, other); } - template - XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_min_pd(self, other); - } -#endif // mul template ::value, void>::type> @@ -903,48 +672,36 @@ namespace xsimd { return _mm_cvtps_epi32(self); } +#endif // neg - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { - return 0 - self; - } - template - XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept - { - return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000))); + return vec_neg(self); } - template - XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept - { - return _mm_xor_pd( - self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000))); - } -#endif - // neq template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpne(self, other); + return ~vec_cmpeq(self, other); } template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_cmpne(self, other); + return ~vec_cmpeq(self, other); } -#if 0 // reciprocal template 
XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) { - return _mm_rcp_ps(self); + return vec_re(self); } +#if 0 // reduce_add template XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept From f6a2a1bc8b13eff992c6ac3f80de848c8662a6e9 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 20:19:07 +0200 Subject: [PATCH 14/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 60 +++++++--------------------- 1 file changed, 14 insertions(+), 46 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 5f07474f3..d1f3047ef 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -95,45 +95,21 @@ namespace xsimd return vec_any_ne(self, vec_xor(self, self)); } -#if 0 // avgr - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_avg_epu8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_avg_epu16(self, other); - } - else - { - return avgr(self, other, common {}); - } + return vec_avg(self, other); } // avg - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - auto adj = ((self ^ other) << 7) >> 7; - return avgr(self, other, A {}) - adj; - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - auto adj = ((self ^ other) << 15) >> 15; - return avgr(self, other, A {}) - adj; - } - else - { - return avg(self, other, common {}); - } + constexpr auto nbit = 8 * sizeof(T) - 1; + constexpr auto adj = ((self ^ other) << nbit) >> nbit; + return avgr(self, other, A {}) - adj; } -#endif // batch_bool_cast template @@ -482,12 +458,7 @@ namespace xsimd tmp0 = _mm_movelh_ps(tmp0, tmp1); return _mm_add_ps(tmp0, tmp2); } - template - XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept - { - return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]), - _mm_unpackhi_pd(row[0], row[1])); - } +#endif // incr_if template ::value, void>::type> @@ -495,7 +466,6 @@ namespace xsimd { return self - batch(mask.data); } -#endif // insert template ::value, void>::type> @@ -522,7 +492,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { - return *(typename batch::register_type)mem; + return batch(*(typename batch::register_type)mem); } #if 0 @@ -791,19 +761,17 @@ namespace xsimd batch acc3 = min(acc2, step3); return first(acc3, A {}); } +#endif // rsqrt template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { - return _mm_rsqrt_ps(val); - } - template - XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept - { - return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val))); + return vec_rsqrt(val); } +#if 0 + // select template XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept @@ -917,14 +885,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { - return vec_st(self, 0, mem); + return vec_st(self.data, 0, mem); } // store_unaligned template ::value, void>::type> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { - *(typename batch::register_type)mem = self; + *(typename 
batch::register_type)mem = self.data; } // sub From c19b1cb2e40d58fbf1b6eb2abe9ad7c81de45d0e Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 20:24:31 +0200 Subject: [PATCH 15/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 97 ++++------------------------ 1 file changed, 11 insertions(+), 86 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index d1f3047ef..0d019150b 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -770,30 +770,18 @@ namespace xsimd return vec_rsqrt(val); } -#if 0 - // select - template - XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept - { - return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br)); - } - - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); + return vec_sel(true_br, false_br, cond); } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept { return select(batch_bool { Values... }, true_br, false_br, altivec {}); } - template - XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept - { - return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br)); - } +#if 0 // shuffle template @@ -823,18 +811,15 @@ namespace xsimd return _mm_shuffle_pd(y, x, smask); return shuffle(x, y, mask, common {}); } +#endif // sqrt template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { - return _mm_sqrt_ps(val); - } - template - XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept - { - return _mm_sqrt_pd(val); + return vec_sqrt(val); } +#if 0 // slide_left template @@ -1014,80 +999,20 @@ namespace xsimd transpose(reinterpret_cast*>(matrix_begin), reinterpret_cast*>(matrix_end), A {}); } +#endif // zip_hi - template - XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_unpackhi_ps(self, other); - } template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_unpackhi_epi8(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_unpackhi_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_unpackhi_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_unpackhi_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } - } - template - XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_unpackhi_pd(self, other); + return vec_merge_hi(self, other); } // zip_lo - template - XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_unpacklo_ps(self, other); - } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return _mm_unpacklo_epi8(self, other); - } - else 
XSIMD_IF_CONSTEXPR(sizeof(T) == 2) - { - return _mm_unpacklo_epi16(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - return _mm_unpacklo_epi32(self, other); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - return _mm_unpacklo_epi64(self, other); - } - else - { - assert(false && "unsupported arch/op combination"); - return {}; - } + return vec_mergel(self, other); } - template - XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_unpacklo_pd(self, other); - } -#endif } } From f8fa5857960292f9f7cc96f7670bca8a177bafda Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 20:31:47 +0200 Subject: [PATCH 16/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 0d019150b..cd7d74408 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -492,7 +492,9 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { - return batch(*(typename batch::register_type)mem); + auto lo = vec_ld(0, mem); + auto hi = vec_ld(16, mem); + return vec_perm(lo, hi, vec_lvsl(0, mem)); } #if 0 From 2de3a6bf3e4ee7de07f2e26ecb6c607f2bab9b6f Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 20:43:33 +0200 Subject: [PATCH 17/57] store --- include/xsimd/arch/xsimd_altivec.hpp | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index cd7d74408..3705c2e3a 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -879,7 +879,24 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { - *(typename batch::register_type)mem = self.data; + // From: https://stackoverflow.com/questions/35317341/how-to-store-a-vector-to-an-unaligned-location-in-memory-with-altivec + // Load the surrounding area + auto low = vec_ld(0, dst); + auto high = vec_ld(16, dst); + // Prepare the constants that we need + auto permuteVector = vec_lvsr(0, (int*)mem); + auto oxFF = vec_splat_s8(-1); + auto ox00 = vec_splat_s8(0); + // Make a mask for which parts of the vectors to swap out + auto mask = vec_perm(ox00, oxFF, permuteVector); + // Right rotate our input data + v = vec_perm(self, self, permuteVector); + // Insert our data into the low and high vectors + low = vec_sel(self, low, mask); + high = vec_sel(high, self, mask); + // Store the two aligned result vectors + vec_st(low, 0, mem); + vec_st(high, 16, mem); } // sub From d66759f7a19b25e5661c3b97e0311f94de1be2a5 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 20:45:43 +0200 Subject: [PATCH 18/57] ++ --- include/xsimd/arch/xsimd_altivec.hpp | 54 +++++++--------------------- 1 file changed, 13 insertions(+), 41 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 3705c2e3a..1578151f3 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -122,12 +122,12 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept { - return vec_and(self, other); + return vec_and(self.data, other.data); } template ::value, void>::type> 
XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_and(self, other); + return vec_and(self.data, other.data); } // bitwise_andnot @@ -239,6 +239,8 @@ namespace xsimd } } +#endif + // decr_if template ::value, void>::type> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept @@ -247,17 +249,14 @@ namespace xsimd } // div - template - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept - { - return _mm_div_ps(self, other); - } - template - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { - return _mm_div_pd(self, other); + return vec_div(self, other); } +#if 0 + // fast_cast namespace detail { @@ -267,33 +266,6 @@ namespace xsimd return _mm_cvtepi32_ps(self); } - template - XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept - { - // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx - // adapted to altivec - __m128i xH = _mm_srli_epi64(x, 32); - xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84 - __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000); - __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 - __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52 - return _mm_add_pd(f, _mm_castsi128_pd(xL)); - } - - template - XSIMD_INLINE batch fast_cast(batch const& x, batch const&, requires_arch) noexcept - { - // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx - // adapted to altivec - __m128i xH = _mm_srai_epi32(x, 16); - xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF)); - xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67 - __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000); - __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52 - __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52 - return _mm_add_pd(f, _mm_castsi128_pd(xL)); - } - template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { @@ -306,12 +278,12 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpeq(self, other); + return vec_cmpeq(self.data, other.data); } template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_cmpeq(self, other); + return vec_cmpeq(self.data, other.data); } // first @@ -881,8 +853,8 @@ namespace xsimd { // From: https://stackoverflow.com/questions/35317341/how-to-store-a-vector-to-an-unaligned-location-in-memory-with-altivec // Load the surrounding area - auto low = vec_ld(0, dst); - auto high = vec_ld(16, dst); + auto low = vec_ld(0, mem); + auto high = vec_ld(16, mem); // Prepare the constants that we need auto permuteVector = vec_lvsr(0, 
(int*)mem); auto oxFF = vec_splat_s8(-1); From 540e0d277b9a5118059fd6b037c6305f79ef08ca Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 21:24:56 +0200 Subject: [PATCH 19/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 1578151f3..bbcf7f7bb 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -414,23 +414,23 @@ namespace xsimd return vec_cmpgt(self, other); } -#if 0 - // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { - __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]); - __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]); - __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]); - tmp0 = _mm_add_ps(tmp0, tmp1); - tmp1 = _mm_unpacklo_ps(row[2], row[3]); - tmp1 = _mm_add_ps(tmp1, tmp2); - tmp2 = _mm_movehl_ps(tmp1, tmp0); - tmp0 = _mm_movelh_ps(tmp0, tmp1); - return _mm_add_ps(tmp0, tmp2); + auto tmp0 = vec_mergee(row[0], row[1]); // v00 v10 v02 v12 + auto tmp1 = vec_mergeo(row[0], row[1]); // v01 v11 v03 v13 + auto tmp4 = vec_add(tmp0, tmp1); // (v00 + v01, v10 + v11, v02 + v03, v12 + v13) + + auto tmp2 = vec_mergee(row[2], row[3]); // v20 v30 v22 v32 + auto tmp3 = vec_mergeo(row[2], row[3]); // v21 v31 v23 v33 + auto tmp5 = vec_add(tmp2, tmp3); // (v20 + v21, v30 + v31, v22 + v23, v32 + v33) + + auto tmp6 = vec_permi(tmp4, tmp5, 0x0); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31) + auto tmp7 = vec_permi(tmp4, tmp5, 0x3); // (v02 + v03, v12 + v13, v22 + v23, v32 + v33) + + return vec_add(tmp6, tmp7); } -#endif // incr_if template ::value, void>::type> From 68ca9a97d7eb29c19ae2fb83bec1cf18316b17e6 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 21:32:48 +0200 Subject: [PATCH 20/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 46 +++++++++++++--------------- 1 file changed, 21 insertions(+), 25 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index bbcf7f7bb..a98335a94 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -278,12 +278,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpeq(self.data, other.data); + auto res = vec_cmpeq(self.data, other.data); + return *reinterpret_cast::register_type*>(&res); } template ::value, void>::type> XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_cmpeq(self.data, other.data); + auto res = vec_cmpeq(self.data, other.data); + return *reinterpret_cast::register_type*>(&res); } // first @@ -793,23 +795,21 @@ namespace xsimd { return vec_sqrt(val); } -#if 0 // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { - return _mm_slli_si128(x, N); + return vec_sll(x, vec_splat_u8(N)); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { - return _mm_srli_si128(x, N); + return vec_srl(x, vec_splat_u8(N)); } -#endif // sadd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return vec_adds(self, other); } // set template - XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, Values...
values) noexcept + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept { static_assert(sizeof...(Values) == batch::size, "consistent init"); return typename batch::register_type { values... }; } + template ::value, void>::type> + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + } + // ssub template ::value, void>::type> @@ -851,24 +857,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept { - // From: https://stackoverflow.com/questions/35317341/how-to-store-a-vector-to-an-unaligned-location-in-memory-with-altivec - // Load the surrounding area - auto low = vec_ld(0, mem); - auto high = vec_ld(16, mem); - // Prepare the constants that we need - auto permuteVector = vec_lvsr(0, (int*)mem); - auto oxFF = vec_splat_s8(-1); - auto ox00 = vec_splat_s8(0); - // Make a mask for which parts of the vectors to swap out - auto mask = vec_perm(ox00, oxFF, permuteVector); - // Right rotate our input data - v = vec_perm(self, self, permuteVector); - // Insert our data into the low and high vectors - low = vec_sel(self, low, mask); - high = vec_sel(high, self, mask); - // Store the two aligned result vectors - vec_st(low, 0, mem); - vec_st(high, 16, mem); + auto tmp = vec_perm(*reinterpret_cast(&self.data), *reinterpret_cast(&self.data), vec_lvsr(0, (unsigned char*)mem)); + vec_ste((__vector unsigned char)tmp, 0, (unsigned char*)mem); + vec_ste((__vector unsigned short)tmp, 1, (unsigned short*)mem); + vec_ste((__vector unsigned int)tmp, 3, (unsigned int*)mem); + vec_ste((__vector unsigned int)tmp, 4, (unsigned int*)mem); + vec_ste((__vector unsigned int)tmp, 8, (unsigned int*)mem); + vec_ste((__vector unsigned int)tmp, 12, (unsigned int*)mem); + vec_ste((__vector unsigned short)tmp, 14, (unsigned short*)mem); } // sub From b5ab4f1a4a5da9090b1a66cfe8dbe11c9a1af5dc Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 23:12:47 +0200 Subject: [PATCH 21/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 43 ++++++---------------------- 1 file changed, 8 insertions(+), 35 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index a98335a94..7f16f092d 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -647,40 +647,18 @@ namespace xsimd return vec_re(self); } -#if 0 // reduce_add - template - XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept - { - __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self)); - __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1)); - return _mm_cvtss_f32(tmp1); - } - - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); - __m128i tmp2 = _mm_add_epi32(self, tmp1); - __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01); - __m128i tmp4 = _mm_add_epi32(tmp2, tmp3); - return _mm_cvtsi128_si32(tmp4); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); - __m128i tmp2 = _mm_add_epi64(self, tmp1); -#if defined(__x86_64__) - return _mm_cvtsi128_si64(tmp2); -#else - __m128i m; - _mm_storel_epi64(&m, tmp2); - int64_t i; - std::memcpy(&i, &m, sizeof(i)); - return i; -#endif + // FIXME: fine an in-order approach + auto tmp0 = 
vec_reve(self); // v3, v2, v1, v0 + auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 + auto tmp2 = vec_permi(tmp1, tmp1, 0x3); // v2 + v1, v3 + v0, v2 + v1, v3 + v0 + auto tmp3 = vec_add(tmp1, tmp2); + return vec_extract(tmp3, 0); } else { @@ -688,12 +666,7 @@ namespace xsimd } } - template - XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept - { - return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); - } - +#if 0 // reduce_max template ::type> XSIMD_INLINE T reduce_max(batch const& self, requires_arch) noexcept From 1bd2ce7fdbc9ca58b4f3aab1814ecfdf31a1689f Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sun, 13 Jul 2025 23:31:20 +0200 Subject: [PATCH 22/57] fast-cast --- include/xsimd/arch/xsimd_altivec.hpp | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 7f16f092d..da00dab14 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -255,24 +255,32 @@ namespace xsimd return vec_div(self, other); } -#if 0 - // fast_cast namespace detail { template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { - return _mm_cvtepi32_ps(self); + return vec_ctf(self.data, 0); + } + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return vec_ctf(self.data, 0); } template XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept { - return _mm_cvttps_epi32(self); + return vec_cts(self.data, 0); + } + + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return vec_ctu(self.data, 0); } } -#endif // eq template ::value, void>::type> From 36cd50cdd57a7771394ca3b604eec6e2741ee375 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 14 Jul 2025 08:36:31 +0200 Subject: [PATCH 23/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index da00dab14..b378a9e94 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -78,7 +78,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept { - return vec_add(self, other); + return vec_add(self.data, other.data); } // all @@ -661,8 +661,7 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 4) { - // FIXME: fine an in-order approach - auto tmp0 = vec_reve(self); // v3, v2, v1, v0 + auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 auto tmp2 = vec_permi(tmp1, tmp1, 0x3); // v2 + v1, v3 + v0, v2 + v1, v3 + v0 auto tmp3 = vec_add(tmp1, tmp2); @@ -673,6 +672,16 @@ namespace xsimd return hadd(self, common {}); } } + template + XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept + { + // FIXME: find an in-order approach + auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 + auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 + auto tmp2 = vec_permi(tmp1, tmp1, 0x3); // v2 + v1, v3 + v0, v2 + v1, v3 + v0 + auto tmp3 = vec_add(tmp1, tmp2); + return vec_extract(tmp3, 0); + } #if 0 // reduce_max From 37988e7b76a533f38ff7bbd56e8b178f8477aa86 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: 
Mon, 14 Jul 2025 08:47:43 +0200 Subject: [PATCH 24/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index b378a9e94..9312c8602 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -663,7 +663,7 @@ namespace xsimd { auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 - auto tmp2 = vec_permi(tmp1, tmp1, 0x3); // v2 + v1, v3 + v0, v2 + v1, v3 + v0 + auto tmp2 = vec_mergeh(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0 auto tmp3 = vec_add(tmp1, tmp2); return vec_extract(tmp3, 0); } @@ -678,7 +678,7 @@ namespace xsimd // FIXME: find an in-order approach auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 - auto tmp2 = vec_permi(tmp1, tmp1, 0x3); // v2 + v1, v3 + v0, v2 + v1, v3 + v0 + auto tmp2 = vec_mergeh(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0 auto tmp3 = vec_add(tmp1, tmp2); return vec_extract(tmp3, 0); } From db1912344187aca4a1b58dc3a83df081ab895089 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 14 Jul 2025 08:56:41 +0200 Subject: [PATCH 25/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 9312c8602..61cc96d29 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -85,14 +85,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { - return vec_all_ne(self, vec_xor(self, self)); + return vec_all_ne(self, vec_xor(self.data, self.data)); } // any template ::value, void>::type> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { - return vec_any_ne(self, vec_xor(self, self)); + return vec_any_ne(self, vec_xor(self.data, self.data)); } // avgr @@ -250,7 +250,7 @@ namespace xsimd // div template ::value, void>::type> - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return vec_div(self, other); } @@ -740,7 +740,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept { - return vec_sel(true_br, false_br, cond); + return vec_sel(true_br.data, false_br.data, cond.data); } template ::value, void>::type> XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept From 308d4d597cee335ea239e895c3c28af10bfba444 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 14 Jul 2025 09:28:25 +0200 Subject: [PATCH 26/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 106 +++++++++++++-------------- 1 file changed, 53 insertions(+), 53 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 61cc96d29..8d83a7bd1 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -85,21 +85,21 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept { - return vec_all_ne(self, vec_xor(self.data, self.data)); + return 
vec_all_ne(self.data, vec_xor(self.data, self.data)); } // any template ::value, void>::type> XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept { - return vec_any_ne(self, vec_xor(self.data, self.data)); + return vec_any_ne(self.data, vec_xor(self.data, self.data)); } // avgr template ::value, void>::type> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { - return vec_avg(self, other); + return vec_avg(self.data, other.data); } // avg @@ -108,7 +108,7 @@ namespace xsimd { constexpr auto nbit = 8 * sizeof(T) - 1; constexpr auto adj = ((self ^ other) << nbit) >> nbit; - return avgr(self, other, A {}) - adj; + return avgr(self.data, other.data, A {}) - adj; } // batch_bool_cast @@ -134,12 +134,12 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { - return vec_nand(self, other); + return vec_nand(self.data, other.data); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_nand(self, other); + return vec_nand(self.data, other.data); } // bitwise_lshift @@ -148,31 +148,31 @@ namespace xsimd { using shift_type = as_unsigned_integer_t; batch shift(static_cast(other)); - return vec_sl(self, shift); + return vec_sl(self.data, shift.data); } // bitwise_not template ::value, void>::type> XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept { - return vec_nor(self, self); + return vec_nor(self.data, self.data); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept { - return vec_nor(self, self); + return vec_nor(self.data, self.data); } // bitwise_or template ::value, void>::type> XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept { - return vec_or(self, other); + return vec_or(self.data, other.data); } template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_or(self, other); + return vec_or(self.data, other.data); } // bitwise_rshift @@ -181,19 +181,19 @@ namespace xsimd { using shift_type = as_unsigned_integer_t; batch shift(static_cast(other)); - return vec_sr(self, shift); + return vec_sr(self.data, shift.data); } // bitwise_xor template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept { - return vec_xor(self, other); + return vec_xor(self.data, other.data); } template ::value, void>::type> XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_xor(self, other); + return vec_xor(self.data, other.data); } // bitwise_cast @@ -252,7 +252,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { - return vec_div(self, other); + return vec_div(self.data, other.data); } // fast_cast @@ -300,7 +300,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE T first(batch const& self, requires_arch) noexcept { - return vec_extract(self, 0); + return vec_extract(self.data, 0); } #if 0 @@ -404,36 +404,36 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpge(self, other); + return vec_cmpge(self.data, 
other.data); } - template - XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpge(self, other); + return vec_cmpge(self.data, other.data); } // gt template ::value, void>::type> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpgt(self, other); + return vec_cmpgt(self.data, other.data); } - template - XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmpgt(self, other); + return vec_cmpgt(self.data, other.data); } // haddp template XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept { - auto tmp0 = vec_mergee(row[0], row[1]); // v00 v10 v02 v12 - auto tmp1 = vec_mergeo(row[0], row[1]); // v01 v11 v03 v13 + auto tmp0 = vec_mergee(row[0].data, row[1].data); // v00 v10 v02 v12 + auto tmp1 = vec_mergeo(row[0].data, row[1].data); // v01 v11 v03 v13 auto tmp4 = vec_add(tmp0, tmp1); // (v00 + v01, v10 + v11, v02 + v03, v12 + v13) - auto tmp2 = vec_mergee(row[2], row[3]); // v20 v30 v22 v32 - auto tmp3 = vec_mergeo(row[2], row[3]); // v21 v31 v23 v33 + auto tmp2 = vec_mergee(row[2].data, row[3].data); // v20 v30 v22 v32 + auto tmp3 = vec_mergeo(row[2].data, row[3].data); // v21 v31 v23 v33 auto tmp5 = vec_add(tmp0, tmp1); // (v20 + v21, v30 + v31, v22 + v23, v32 + v33) auto tmp6 = vec_permi(tmp4, tmp5, 0x0); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31 @@ -453,14 +453,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { - return vec_insert(val, self, pos); + return vec_insert(val, self.data, pos); } // isnan template XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept { - return ~vec_cmpeq(self, self); + return ~vec_cmpeq(self.data, self.data); } // load_aligned @@ -501,22 +501,22 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmple(self, other); + return vec_cmple(self.data, other.data); } - template - XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmple(self, other); + return vec_cmple(self.data, other.data); } // lt template ::value, void>::type> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { - return vec_cmplt(self, other); + return vec_cmplt(self.data, other.data); } - template - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + template ::value, void>::type> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmplt(self, other); } @@ -601,21 +601,21 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept { - return vec_max(self, other); + return vec_max(self.data, other.data); } // min template ::value, void>::type> XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept { - return vec_min(self, other); + return 
vec_min(self.data, other.data); } // mul template ::value, void>::type> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { - return vec_mul(self, other); + return vec_mul(self.data, other.data); } #if 0 @@ -632,19 +632,19 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { - return vec_neg(self); + return vec_neg(self.data); } // neq template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept { - return ~vec_cmpeq(self, other); + return ~vec_cmpeq(self.data, other.data); } template ::value, void>::type> XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return ~vec_cmpeq(self, other); + return ~vec_cmpeq(self.data, other.data); } // reciprocal @@ -652,7 +652,7 @@ namespace xsimd XSIMD_INLINE batch reciprocal(batch const& self, kernel::requires_arch) { - return vec_re(self); + return vec_re(self.data); } // reduce_add @@ -733,7 +733,7 @@ namespace xsimd template XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept { - return vec_rsqrt(val); + return vec_rsqrt(val.data); } // select @@ -783,28 +783,28 @@ namespace xsimd template XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept { - return vec_sqrt(val); + return vec_sqrt(val.data); } // slide_left template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { - return vec_sll(x, vec_splat_u8(N)); + return vec_sll(x.data, vec_splat_u8(N)); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { - return vec_srl(x, vec_splat_u8(N)); + return vec_srl(x.data, vec_splat_u8(N)); } // sadd template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { - return vec_adds(self, other); + return vec_adds(self.data, other.data); } // set @@ -828,7 +828,7 @@ namespace xsimd { XSIMD_IF_CONSTEXPR(sizeof(T) == 1) { - return vec_subs(self, other); + return vec_subs(self.data, other.data); } else { @@ -861,7 +861,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept { - return vec_sub(self, other); + return vec_sub(self.data, other.data); } #if 0 @@ -981,14 +981,14 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { - return vec_merge_hi(self, other); + return vec_merge_hi(self.data, other.data); } // zip_lo template ::value, void>::type> XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept { - return vec_mergel(self, other); + return vec_mergel(self.data, other.data); } } } From 8c7d552c8fef5c3dba79ade14dcba683abc41c2c Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 14 Jul 2025 10:12:16 +0200 Subject: [PATCH 27/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 20 -------------------- 1 file changed, 20 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 8d83a7bd1..97d12586c 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -406,11 +406,6 @@ namespace xsimd { return vec_cmpge(self.data, other.data); } - template ::value, void>::type> - XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept - { - return vec_cmpge(self.data, other.data); - } // gt 
template ::value, void>::type> @@ -418,11 +413,6 @@ namespace xsimd { return vec_cmpgt(self.data, other.data); } - template ::value, void>::type> - XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept - { - return vec_cmpgt(self.data, other.data); - } // haddp template @@ -503,11 +493,6 @@ namespace xsimd { return vec_cmple(self.data, other.data); } - template ::value, void>::type> - XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept - { - return vec_cmple(self.data, other.data); - } // lt template ::value, void>::type> @@ -515,11 +500,6 @@ namespace xsimd { return vec_cmplt(self.data, other.data); } - template ::value, void>::type> - XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept - { - return vec_cmplt(self, other); - } #if 0 From ca1fd743dd5d59dbeedc3526677e68bd2aaa8d1b Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 14 Jul 2025 11:07:51 +0200 Subject: [PATCH 28/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 2 +- .../xsimd/types/xsimd_altivec_register.hpp | 27 +++++++++++++------ 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 97d12586c..94fc01e0f 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -252,7 +252,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { - return vec_div(self.data, other.data); + return vec_mul(self.data, vec_re(other.data)); } // fast_cast diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp index 52f896bf2..cf15a3f9f 100644 --- a/include/xsimd/types/xsimd_altivec_register.hpp +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -38,14 +38,25 @@ namespace xsimd #if XSIMD_WITH_ALTIVEC namespace types { - XSIMD_DECLARE_SIMD_REGISTER(signed char, altivec, __vector signed char); - XSIMD_DECLARE_SIMD_REGISTER(unsigned char, altivec, __vector unsigned char); - XSIMD_DECLARE_SIMD_REGISTER(char, altivec, __vector char); - XSIMD_DECLARE_SIMD_REGISTER(unsigned short, altivec, __vector unsigned short); - XSIMD_DECLARE_SIMD_REGISTER(short, altivec, __vector short); - XSIMD_DECLARE_SIMD_REGISTER(unsigned int, altivec, __vector unsigned int); - XSIMD_DECLARE_SIMD_REGISTER(int, altivec, __vector int); - XSIMD_DECLARE_SIMD_REGISTER(float, altivec, __vector float); + +#define XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(T, Tb) \ + template <> \ + struct get_bool_simd_register \ + { \ + using type = __vector __bool Tb; \ + }; \ + XSIMD_DECLARE_SIMD_REGISTER(T, altivec, __vector T) + + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(signed char, char); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned char, char); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(char, char); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned short, short); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(short, short); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned int, int); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(int, int); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(float, float); + +#undef XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER } #endif } From 8602e056dcff88e1b1d9b5db47f045c5a4239d45 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 15 Jul 2025 01:14:24 +0200 Subject: [PATCH 29/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 3 ++- .../xsimd/types/xsimd_altivec_register.hpp | 24 
+++++++++++++------ 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 94fc01e0f..9310838f2 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -798,7 +798,8 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept { - return set(batch(), A {}, static_cast(values ? -1LL : 0LL)...).data; + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return typename batch_bool::register_type { static_cast::register_type>()[0])>(values ? -1LL : 0LL)... }; } // ssub diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp index cf15a3f9f..2ed7d89ee 100644 --- a/include/xsimd/types/xsimd_altivec_register.hpp +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -39,12 +39,22 @@ namespace xsimd namespace types { -#define XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(T, Tb) \ - template <> \ - struct get_bool_simd_register \ - { \ - using type = __vector __bool Tb; \ - }; \ +#define XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(T, Tb) \ + template <> \ + struct get_bool_simd_register \ + { \ + struct type \ + { \ + using register_type = __vector __bool Tb; \ + register_type data; \ + type() = default; \ + type(register_type r) \ + : data(r) \ + { \ + } \ + operator register_type() const noexcept { return data; } \ + }; \ + }; \ XSIMD_DECLARE_SIMD_REGISTER(T, altivec, __vector T) XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(signed char, char); @@ -54,7 +64,7 @@ namespace xsimd XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(short, short); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned int, int); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(int, int); - XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(float, float); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(float, int); #undef XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER } From 4f4092d1954707b145635797e355eb0ad73ad91c Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 15 Jul 2025 12:18:21 +0200 Subject: [PATCH 30/57] WIP --- include/xsimd/config/xsimd_inline.hpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/include/xsimd/config/xsimd_inline.hpp b/include/xsimd/config/xsimd_inline.hpp index 88e9cbcd0..33dba1033 100644 --- a/include/xsimd/config/xsimd_inline.hpp +++ b/include/xsimd/config/xsimd_inline.hpp @@ -12,6 +12,10 @@ #ifndef XSIMD_INLINE_HPP #define XSIMD_INLINE_HPP +#if defined(__VEC__) +#define XSIMD_INLINE inline +#else + #if defined(__GNUC__) #define XSIMD_INLINE inline __attribute__((always_inline)) #elif defined(_MSC_VER) @@ -21,3 +25,5 @@ #endif #endif + +#endif From b7a286e62e2b041dec8dfb96d432ae761c3fbf3d Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 15 Jul 2025 12:33:38 +0200 Subject: [PATCH 31/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 9310838f2..649c79b35 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -249,8 +249,8 @@ namespace xsimd } // div - template ::value, void>::type> - XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept { return vec_mul(self.data, vec_re(other.data)); } From 
e68ad27d7dea75b8e8dad13c280b2dd534f92f29 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 15 Jul 2025 12:36:58 +0200 Subject: [PATCH 32/57] gcc ver --- .github/workflows/cross-ppc.yml | 4 ++-- include/xsimd/arch/xsimd_altivec.hpp | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml index ec7a11f04..f1617b88b 100644 --- a/.github/workflows/cross-ppc.yml +++ b/.github/workflows/cross-ppc.yml @@ -10,9 +10,9 @@ jobs: strategy: matrix: target: - - { platform: 'ppc', dir: 'powerpc-linux-gnu', flags: '-maltivec', full: 'OFF' } + - { platform: 'ppc', dir: 'powerpc-linux-gnu', flags: '-maltivec -mvsx', full: 'OFF' } sys: - - { compiler: 'gcc', version: '10' } + - { compiler: 'gcc', version: '12' } steps: - name: Setup compiler if: ${{ matrix.sys.compiler == 'gcc' }} diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 649c79b35..509c76646 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -71,7 +71,7 @@ namespace xsimd template XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept { - return vec_abs(self); + return vec_abs(self.data); } // add @@ -401,14 +401,14 @@ namespace xsimd } #endif // ge - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmpge(self.data, other.data); } // gt - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmpgt(self.data, other.data); @@ -440,7 +440,7 @@ namespace xsimd } // insert - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { return vec_insert(val, self.data, pos); @@ -488,14 +488,14 @@ namespace xsimd #endif // le - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmple(self.data, other.data); } // lt - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept { return vec_cmplt(self.data, other.data); @@ -592,10 +592,11 @@ namespace xsimd } // mul - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept { - return vec_mul(self.data, other.data); + return self.data * other.data; + // return vec_mul(self.data, other.data); } #if 0 From 1040fef6482160b84c86f3128bf79d9860d8f207 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 09:46:11 +0200 Subject: [PATCH 33/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 509c76646..f3451681a 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -66,6 +66,8 @@ namespace xsimd template XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; #endif + template + XSIMD_INLINE batch ssub(batch const&, batch const&, requires_arch) noexcept; // abs template @@ -103,13 +105,18 @@ namespace xsimd } // avg - template ::value, void>::type> + template 
::value, void>::type> XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept { constexpr auto nbit = 8 * sizeof(T) - 1; - constexpr auto adj = ((self ^ other) << nbit) >> nbit; + auto adj = ((self ^ other) << nbit) >> nbit; return avgr(self.data, other.data, A {}) - adj; } + template + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_avg(self.data, other.data); + } // batch_bool_cast template @@ -613,7 +620,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept { - return vec_neg(self.data); + return -(self.data); } // neq From 1c231958841e1fd522f6a7e8595a7e2341ae3da2 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 11:37:00 +0200 Subject: [PATCH 34/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index f3451681a..0c407db5f 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -66,8 +66,6 @@ namespace xsimd template XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; #endif - template - XSIMD_INLINE batch ssub(batch const&, batch const&, requires_arch) noexcept; // abs template @@ -98,7 +96,7 @@ namespace xsimd } // avgr - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept { return vec_avg(self.data, other.data); @@ -112,11 +110,6 @@ namespace xsimd auto adj = ((self ^ other) << nbit) >> nbit; return avgr(self.data, other.data, A {}) - adj; } - template - XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept - { - return vec_avg(self.data, other.data); - } // batch_bool_cast template @@ -812,17 +805,10 @@ namespace xsimd // ssub - template ::value, void>::type> + template ::value && sizeof(T) == 1, void>::type> XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept { - XSIMD_IF_CONSTEXPR(sizeof(T) == 1) - { - return vec_subs(self.data, other.data); - } - else - { - return ssub(self, other, common {}); - } + return vec_subs(self.data, other.data); } // store_aligned From 62b9257abcca955356e6dc9737ed63f74285a5b8 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 11:48:29 +0200 Subject: [PATCH 35/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 0c407db5f..4d7a1e76c 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -108,7 +108,7 @@ namespace xsimd { constexpr auto nbit = 8 * sizeof(T) - 1; auto adj = ((self ^ other) << nbit) >> nbit; - return avgr(self.data, other.data, A {}) - adj; + return avgr(self, other, A {}) - adj; } // batch_bool_cast @@ -782,7 +782,7 @@ namespace xsimd } // sadd - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept { return vec_adds(self.data, other.data); From 37548ca85aa1897412cb479c058f5c40c3b55e19 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 19:04:21 +0200 Subject: [PATCH 36/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 
deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 4d7a1e76c..e84aa25f2 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -134,12 +134,12 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept { - return vec_nand(self.data, other.data); + return self.data & ~other.data; } - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { - return vec_nand(self.data, other.data); + return self.data & ~other.data; } // bitwise_lshift @@ -191,7 +191,7 @@ namespace xsimd return vec_xor(self.data, other.data); } template ::value, void>::type> - XSIMD_INLINE batch bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept { return vec_xor(self.data, other.data); } From 7e1d26a698ec775423bdfab0b68d8b6f93490fc7 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 23:36:21 +0200 Subject: [PATCH 37/57] double --- test/test_batch_cast.cpp | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/test/test_batch_cast.cpp b/test/test_batch_cast.cpp index 8a638ed24..5d84176d3 100644 --- a/test/test_batch_cast.cpp +++ b/test/test_batch_cast.cpp @@ -70,15 +70,18 @@ struct batch_cast_test using uint16_batch = xsimd::batch; using int32_batch = xsimd::batch; using uint32_batch = xsimd::batch; + using float_batch = xsimd::batch; +#ifndef XSIMD_WITH_ALTIVEC using int64_batch = xsimd::batch; using uint64_batch = xsimd::batch; - using float_batch = xsimd::batch; using double_batch = xsimd::batch; +#endif std::vector int_test_values; - std::vector float_test_values; std::vector double_test_values; + std::vector float_test_values; + batch_cast_test() { int_test_values = { @@ -182,12 +185,14 @@ struct batch_cast_test test_cast_impl(test_value, "batch cast uint32 -> uint32"); test_cast_impl(test_value, "batch cast uint32 -> float"); +#ifndef XSIMD_WITH_ALTIVEC test_cast_impl(test_value, "batch cast int64 -> int64"); test_cast_impl(test_value, "batch cast int64 -> uint64"); test_cast_impl(test_value, "batch cast int64 -> double"); test_cast_impl(test_value, "batch cast uint64 -> int64"); test_cast_impl(test_value, "batch cast uint64 -> uint64"); test_cast_impl(test_value, "batch cast uint64 -> double"); +#endif } for (const auto& test_value : float_test_values) @@ -197,12 +202,14 @@ struct batch_cast_test test_cast_impl(test_value, "batch cast float -> float"); } +#ifndef XSIMD_WITH_ALTIVEC for (const auto& test_value : double_test_values) { test_cast_impl(test_value, "batch cast double -> int64"); test_cast_impl(test_value, "batch cast double -> uint64"); test_cast_impl(test_value, "batch cast double -> double"); } +#endif } #if 0 && XSIMD_X86_INSTR_SET >= XSIMD_X86_AVX_VERSION From 35aa9e7733758c20f552d155d2daf65dbcb5a8d4 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Wed, 16 Jul 2025 23:59:07 +0200 Subject: [PATCH 38/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index e84aa25f2..86373b072 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -115,7 
+115,7 @@ namespace xsimd template XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept { - return { bitwise_cast(batch(self.data)).data }; + return (typename batch_bool::register_type)self.data; } // bitwise_and From abfece217311cb39f267a509b1896c575b27b012 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 17 Jul 2025 00:14:56 +0200 Subject: [PATCH 39/57] WIP --- .github/workflows/cross-ppc.yml | 2 +- include/xsimd/arch/xsimd_altivec.hpp | 19 ++------- test/CMakeLists.txt | 58 ++++++++++++++-------------- 3 files changed, 33 insertions(+), 46 deletions(-) diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml index f1617b88b..70695b0c6 100644 --- a/.github/workflows/cross-ppc.yml +++ b/.github/workflows/cross-ppc.yml @@ -39,5 +39,5 @@ jobs: run: cmake --build _build --verbose -j1 - name: Testing xsimd run: | - qemu-${{ matrix.target.platform }} -cpu 7400 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd + qemu-${{ matrix.target.platform }} -cpu 7457 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd working-directory: ${{ github.workspace }}/_build diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 86373b072..27d54d9df 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -209,7 +209,6 @@ namespace xsimd { return vec_splats(val); } -#if 0 // store_complex namespace detail @@ -219,28 +218,16 @@ namespace xsimd template XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept { - return _mm_unpacklo_ps(self.real(), self.imag()); + return vec_mergel(self.real().data, self.imag().data); } // complex_high template XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept { - return _mm_unpackhi_ps(self.real(), self.imag()); - } - template - XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept - { - return _mm_unpacklo_pd(self.real(), self.imag()); - } - template - XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept - { - return _mm_unpackhi_pd(self.real(), self.imag()); + return vec_mergeh(self.real().data, self.imag().data); } } -#endif - // decr_if template ::value, void>::type> XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept @@ -956,7 +943,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { - return vec_merge_hi(self.data, other.data); + return vec_mergeh(self.data, other.data); } // zip_lo diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e6bad7999..95ab8d2af 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -131,39 +131,39 @@ set(XSIMD_TESTS main.cpp test_api.cpp test_arch.cpp - test_basic_math.cpp + #test_basic_math.cpp test_batch.cpp test_batch_bool.cpp test_batch_cast.cpp - test_batch_complex.cpp - test_batch_float.cpp - test_batch_int.cpp - test_bitwise_cast.cpp - test_batch_constant.cpp - test_batch_manip.cpp - test_complex_exponential.cpp - test_complex_hyperbolic.cpp - test_complex_power.cpp - test_complex_trigonometric.cpp - test_conversion.cpp - test_custom_default_arch.cpp - test_error_gamma.cpp - test_explicit_batch_instantiation.cpp - test_exponential.cpp - test_extract_pair.cpp - test_fp_manipulation.cpp - test_hyperbolic.cpp - test_load_store.cpp - test_memory.cpp - test_poly_evaluation.cpp - test_power.cpp - test_rounding.cpp - test_select.cpp - 
test_shuffle.cpp + # test_batch_complex.cpp + # test_batch_float.cpp + # test_batch_int.cpp + # test_bitwise_cast.cpp + # test_batch_constant.cpp + # test_batch_manip.cpp + # test_complex_exponential.cpp + # test_complex_hyperbolic.cpp + # test_complex_power.cpp + # test_complex_trigonometric.cpp + # test_conversion.cpp + # test_custom_default_arch.cpp + # test_error_gamma.cpp + # test_explicit_batch_instantiation.cpp + # test_exponential.cpp + # test_extract_pair.cpp + # test_fp_manipulation.cpp + # test_hyperbolic.cpp + # test_load_store.cpp + # test_memory.cpp + # test_poly_evaluation.cpp + # test_power.cpp + # test_rounding.cpp + # test_select.cpp + # test_shuffle.cpp test_sum.cpp - test_traits.cpp - test_trigonometric.cpp - test_xsimd_api.cpp + # test_traits.cpp + # test_trigonometric.cpp + # test_xsimd_api.cpp test_utils.hpp ) From 4e8638a3b8fbb0fff534ede003e881d75489eda6 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 17 Jul 2025 10:46:22 +0200 Subject: [PATCH 40/57] WIP --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 95ab8d2af..8bb46afda 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -135,7 +135,7 @@ set(XSIMD_TESTS test_batch.cpp test_batch_bool.cpp test_batch_cast.cpp - # test_batch_complex.cpp + test_batch_complex.cpp # test_batch_float.cpp # test_batch_int.cpp # test_bitwise_cast.cpp From bdb295bf7be44ff155c3d3ea3bcf7ae7f35ee7f7 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 17 Jul 2025 10:57:41 +0200 Subject: [PATCH 41/57] WIP --- .github/toolchains/gcc-powerpc-linux-gnu.cmake | 5 ----- .github/toolchains/gcc-powerpc64-linux-gnu.cmake | 5 +++++ .github/workflows/cross-ppc.yml | 4 ++-- include/xsimd/arch/xsimd_altivec.hpp | 10 +--------- test/CMakeLists.txt | 2 +- 5 files changed, 9 insertions(+), 17 deletions(-) delete mode 100644 .github/toolchains/gcc-powerpc-linux-gnu.cmake create mode 100644 .github/toolchains/gcc-powerpc64-linux-gnu.cmake diff --git a/.github/toolchains/gcc-powerpc-linux-gnu.cmake b/.github/toolchains/gcc-powerpc-linux-gnu.cmake deleted file mode 100644 index a318f6412..000000000 --- a/.github/toolchains/gcc-powerpc-linux-gnu.cmake +++ /dev/null @@ -1,5 +0,0 @@ -set(CMAKE_SYSTEM_PROCESSOR powerpc) -set(triple powerpc-linux-gnu) - -include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) - diff --git a/.github/toolchains/gcc-powerpc64-linux-gnu.cmake b/.github/toolchains/gcc-powerpc64-linux-gnu.cmake new file mode 100644 index 000000000..5dd97d6c6 --- /dev/null +++ b/.github/toolchains/gcc-powerpc64-linux-gnu.cmake @@ -0,0 +1,5 @@ +set(CMAKE_SYSTEM_PROCESSOR powerpc64) +set(triple powerpc64-linux-gnu) + +include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) + diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml index 70695b0c6..49114b7bb 100644 --- a/.github/workflows/cross-ppc.yml +++ b/.github/workflows/cross-ppc.yml @@ -10,7 +10,7 @@ jobs: strategy: matrix: target: - - { platform: 'ppc', dir: 'powerpc-linux-gnu', flags: '-maltivec -mvsx', full: 'OFF' } + - { platform: 'ppc64', dir: 'powerpc64-linux-gnu', flags: '-maltivec -mvsx -mcpu=power8', full: 'OFF' } sys: - { compiler: 'gcc', version: '12' } steps: @@ -39,5 +39,5 @@ jobs: run: cmake --build _build --verbose -j1 - name: Testing xsimd run: | - qemu-${{ matrix.target.platform }} -cpu 7457 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd + #qemu-${{ matrix.target.platform }} -cpu power8 -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd working-directory: ${{ 
github.workspace }}/_build diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 27d54d9df..891c266f7 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -456,23 +456,15 @@ namespace xsimd return vec_perm(lo, hi, vec_lvsl(0, mem)); } -#if 0 // load_complex namespace detail { - // Redefine these methods in the SSE-based archs if required template XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept { - return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) }; - } - template - XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept - { - return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) }; + return { vec_mergee(hi.data, lo.data), vec_mergeo(hi.data, lo.data) }; } } -#endif // le template ::value, void>::type> diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8bb46afda..ac4af1406 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -131,7 +131,7 @@ set(XSIMD_TESTS main.cpp test_api.cpp test_arch.cpp - #test_basic_math.cpp + test_basic_math.cpp test_batch.cpp test_batch_bool.cpp test_batch_cast.cpp From de42edc3f981dcc5459866ca39badf7f0889ad05 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Thu, 17 Jul 2025 15:25:57 +0200 Subject: [PATCH 42/57] WIP --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index ac4af1406..cde63cd56 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -136,7 +136,7 @@ set(XSIMD_TESTS test_batch_bool.cpp test_batch_cast.cpp test_batch_complex.cpp - # test_batch_float.cpp + test_batch_float.cpp # test_batch_int.cpp # test_bitwise_cast.cpp # test_batch_constant.cpp From 82da5e5fa8f68712e897d09be65e7ac7b5e9a712 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 01:03:33 +0200 Subject: [PATCH 43/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 891c266f7..6d7b76e6a 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -413,8 +413,8 @@ namespace xsimd auto tmp3 = vec_mergeo(row[2].data, row[3].data); // v21 v31 v23 v33 auto tmp5 = vec_add(tmp0, tmp1); // (v20 + v21, v30 + v31, v22 + v23, v32 + v33) - auto tmp6 = vec_permi(tmp4, tmp5, 0x0); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31 - auto tmp7 = vec_permi(tmp4, tmp5, 0x3); // (v02 + v03, v12 + v13, v12 + v13, v32 + v33) + auto tmp6 = vec_perm(tmp4, tmp5, (__vector unsigned char) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31 + auto tmp7 = vec_perm(tmp4, tmp5, (__vector unsigned char) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 }); // (v02 + v03, v12 + v13, v12 + v13, v32 + v33) return vec_add(tmp6, tmp7); } From ff1a2d6d6101b9b56321addecf6e58a0f02c3429 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 10:12:06 +0200 Subject: [PATCH 44/57] WIP --- test/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index cde63cd56..09c350a2a 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -137,9 +137,9 @@ set(XSIMD_TESTS test_batch_cast.cpp test_batch_complex.cpp 
test_batch_float.cpp - # test_batch_int.cpp + test_batch_int.cpp # test_bitwise_cast.cpp - # test_batch_constant.cpp + test_batch_constant.cpp # test_batch_manip.cpp # test_complex_exponential.cpp # test_complex_hyperbolic.cpp From 32becba2fe621f86571f6938b22eb436b6e13608 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 10:21:10 +0200 Subject: [PATCH 45/57] WIP --- test/CMakeLists.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 09c350a2a..e01e3684d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -138,7 +138,7 @@ set(XSIMD_TESTS test_batch_complex.cpp test_batch_float.cpp test_batch_int.cpp - # test_bitwise_cast.cpp + test_bitwise_cast.cpp test_batch_constant.cpp # test_batch_manip.cpp # test_complex_exponential.cpp @@ -153,7 +153,7 @@ set(XSIMD_TESTS # test_extract_pair.cpp # test_fp_manipulation.cpp # test_hyperbolic.cpp - # test_load_store.cpp + test_load_store.cpp # test_memory.cpp # test_poly_evaluation.cpp # test_power.cpp @@ -161,8 +161,8 @@ set(XSIMD_TESTS # test_select.cpp # test_shuffle.cpp test_sum.cpp - # test_traits.cpp - # test_trigonometric.cpp + test_traits.cpp + test_trigonometric.cpp # test_xsimd_api.cpp test_utils.hpp ) From 4e8d2702d0902f8a0bb2b2d359aba8a916fdbc70 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 10:37:09 +0200 Subject: [PATCH 46/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 2 +- test/CMakeLists.txt | 8 +- test/test_bitwise_cast.cpp | 108 ++++++++++++++++----------- 3 files changed, 71 insertions(+), 47 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 6d7b76e6a..7cbb861c9 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -430,7 +430,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch insert(batch const& self, T val, index pos, requires_arch) noexcept { - return vec_insert(val, self.data, pos); + return vec_insert(val, self.data, I); } // isnan diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e01e3684d..883d2b39f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -141,10 +141,10 @@ set(XSIMD_TESTS test_bitwise_cast.cpp test_batch_constant.cpp # test_batch_manip.cpp - # test_complex_exponential.cpp - # test_complex_hyperbolic.cpp - # test_complex_power.cpp - # test_complex_trigonometric.cpp + test_complex_exponential.cpp + test_complex_hyperbolic.cpp + test_complex_power.cpp + test_complex_trigonometric.cpp # test_conversion.cpp # test_custom_default_arch.cpp # test_error_gamma.cpp diff --git a/test/test_bitwise_cast.cpp b/test/test_bitwise_cast.cpp index 59e19cfdb..8efedeadf 100644 --- a/test/test_bitwise_cast.cpp +++ b/test/test_bitwise_cast.cpp @@ -21,37 +21,44 @@ struct bitwise_cast_test static constexpr size_t N = CP::size; using int32_batch = xsimd::batch; - using int64_batch = xsimd::batch; using float_batch = xsimd::batch; - using double_batch = xsimd::batch; using int32_vector = std::vector>; - using int64_vector = std::vector>; using float_vector = std::vector>; - using double_vector = std::vector>; int32_vector ftoi32_res; - int32_vector dtoi32_res; - int64_vector ftoi64_res; - int64_vector dtoi64_res; float_vector i32tof_res; + +#ifndef XSIMD_WITH_ALTIVEC + using int64_batch = xsimd::batch; + using double_batch = xsimd::batch; + + using int64_vector = std::vector>; + using double_vector = std::vector>; + + int32_vector dtoi32_res; float_vector i64tof_res; 
float_vector dtof_res; + int64_vector ftoi64_res; + int64_vector dtoi64_res; double_vector i32tod_res; double_vector i64tod_res; double_vector ftod_res; +#endif bitwise_cast_test() : ftoi32_res(2 * N) - , dtoi32_res(2 * N) - , ftoi64_res(N) - , dtoi64_res(N) , i32tof_res(2 * N) +#ifndef XSIMD_WITH_ALTIVEC + , dtoi32_res(2 * N) , i64tof_res(2 * N) , dtof_res(2 * N) + , ftoi64_res(N) + , dtoi64_res(N) , i32tod_res(N) , i64tod_res(N) , ftod_res(N) +#endif { { int32_batch input = i32_input(); @@ -59,8 +66,22 @@ struct bitwise_cast_test b.i32[0] = input.get(0); b.i32[1] = input.get(1); std::fill(i32tof_res.begin(), i32tof_res.end(), b.f[0]); +#ifndef XSIMD_WITH_ALTIVEC std::fill(i32tod_res.begin(), i32tod_res.end(), b.d); +#endif + } + { + float_batch input = f_input(); + bitcast b; + b.f[0] = input.get(0); + b.f[1] = input.get(1); + std::fill(ftoi32_res.begin(), ftoi32_res.end(), b.i32[0]); +#ifndef XSIMD_WITH_ALTIVEC + std::fill(ftoi64_res.begin(), ftoi64_res.end(), b.i64); + std::fill(ftod_res.begin(), ftod_res.end(), b.d); +#endif } +#ifndef XSIMD_WITH_ALTIVEC { int64_batch input = i64_input(); bitcast b; @@ -72,15 +93,6 @@ struct bitwise_cast_test i64tof_res[2 * i + 1] = b.f[1]; } } - { - float_batch input = f_input(); - bitcast b; - b.f[0] = input.get(0); - b.f[1] = input.get(1); - std::fill(ftoi32_res.begin(), ftoi32_res.end(), b.i32[0]); - std::fill(ftoi64_res.begin(), ftoi64_res.end(), b.i64); - std::fill(ftod_res.begin(), ftod_res.end(), b.d); - } { double_batch input = d_input(); bitcast b; @@ -95,6 +107,7 @@ struct bitwise_cast_test dtof_res[2 * i + 1] = b.f[1]; } } +#endif } void test_to_int32() @@ -106,29 +119,14 @@ struct bitwise_cast_test INFO("to_int32(float)"); CHECK_VECTOR_EQ(i32vres, ftoi32_res); } +#ifndef XSIMD_WITH_ALTIVEC { int32_batch i32bres = xsimd::bitwise_cast(d_input()); i32bres.store_aligned(i32vres.data()); INFO("to_int32(double)"); CHECK_VECTOR_EQ(i32vres, dtoi32_res); } - } - - void test_to_int64() - { - int64_vector i64vres(int64_batch::size); - { - int64_batch i64bres = xsimd::bitwise_cast(f_input()); - i64bres.store_aligned(i64vres.data()); - INFO("to_int64(float)"); - CHECK_VECTOR_EQ(i64vres, ftoi64_res); - } - { - int64_batch i64bres = xsimd::bitwise_cast(d_input()); - i64bres.store_aligned(i64vres.data()); - INFO("to_int64(double)"); - CHECK_VECTOR_EQ(i64vres, dtoi64_res); - } +#endif } void test_to_float() @@ -140,6 +138,7 @@ struct bitwise_cast_test INFO("to_float(int32_t)"); CHECK_VECTOR_EQ(fvres, i32tof_res); } +#ifndef XSIMD_WITH_ALTIVEC { float_batch fbres = xsimd::bitwise_cast(i64_input()); fbres.store_aligned(fvres.data()); @@ -152,6 +151,26 @@ struct bitwise_cast_test INFO("to_float(double)"); CHECK_VECTOR_EQ(fvres, dtof_res); } +#endif + } + +#ifndef XSIMD_WITH_ALTIVEC + + void test_to_int64() + { + int64_vector i64vres(int64_batch::size); + { + int64_batch i64bres = xsimd::bitwise_cast(f_input()); + i64bres.store_aligned(i64vres.data()); + INFO("to_int64(float)"); + CHECK_VECTOR_EQ(i64vres, ftoi64_res); + } + { + int64_batch i64bres = xsimd::bitwise_cast(d_input()); + i64bres.store_aligned(i64vres.data()); + INFO("to_int64(double)"); + CHECK_VECTOR_EQ(i64vres, dtoi64_res); + } } void test_to_double() @@ -176,6 +195,7 @@ struct bitwise_cast_test CHECK_VECTOR_EQ(dvres, ftod_res); } } +#endif private: int32_batch i32_input() const @@ -183,20 +203,22 @@ struct bitwise_cast_test return int32_batch(2); } - int64_batch i64_input() const + float_batch f_input() const { - return int64_batch(2); + return float_batch(3.); } - float_batch f_input() const 
+#ifndef XSIMD_WITH_ALTIVEC + int64_batch i64_input() const { - return float_batch(3.); + return int64_batch(2); } double_batch d_input() const { return double_batch(2.5e17); } +#endif union bitcast { @@ -212,11 +234,13 @@ TEST_CASE_TEMPLATE("[bitwise cast]", B, CONVERSION_TYPES) bitwise_cast_test Test; SUBCASE("to_int32") { Test.test_to_int32(); } - SUBCASE("to_int64") { Test.test_to_int64(); } - SUBCASE("to_float") { Test.test_to_float(); } +#ifndef XSIMD_WITH_ALTIVEC + SUBCASE("to_int64") { Test.test_to_int64(); } + SUBCASE("to_double") { Test.test_to_double(); } +#endif } #endif #endif From d0be84bcb5e1800366647b983345acba20436aaa Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 12:13:36 +0200 Subject: [PATCH 47/57] WIP --- test/test_load_store.cpp | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index 7a4d80932..8da992b7d 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -32,14 +32,18 @@ struct load_store_test using uint16_vector_type = std::vector>; using int32_vector_type = std::vector>; using uint32_vector_type = std::vector>; +#ifndef XSIMD_WITH_ALTIVEC using int64_vector_type = std::vector>; using uint64_vector_type = std::vector>; +#endif #ifdef XSIMD_32_BIT_ABI using long_vector_type = std::vector>; using ulong_vector_type = std::vector>; #endif using float_vector_type = std::vector>; +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 using double_vector_type = std::vector>; +#endif int8_vector_type i8_vec; uint8_vector_type ui8_vec; @@ -47,14 +51,18 @@ struct load_store_test uint16_vector_type ui16_vec; int32_vector_type i32_vec; uint32_vector_type ui32_vec; +#ifndef XSIMD_WITH_ALTIVEC int64_vector_type i64_vec; uint64_vector_type ui64_vec; +#endif #ifdef XSIMD_32_BIT_ABI long_vector_type l_vec; ulong_vector_type ul_vec; #endif float_vector_type f_vec; +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 double_vector_type d_vec; +#endif array_type expected; @@ -66,14 +74,18 @@ struct load_store_test init_test_vector(ui16_vec); init_test_vector(i32_vec); init_test_vector(ui32_vec); +#ifndef XSIMD_WITH_ALTIVEC init_test_vector(i64_vec); init_test_vector(ui64_vec); +#endif #ifdef XSIMD_32_BIT_ABI init_test_vector(l_vec); init_test_vector(ul_vec); #endif init_test_vector(f_vec); +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 init_test_vector(d_vec); +#endif } void test_load() @@ -84,14 +96,16 @@ struct load_store_test test_load_impl(ui16_vec, "load uint16_t"); test_load_impl(i32_vec, "load int32_t"); test_load_impl(ui32_vec, "load uint32_t"); +#ifndef XSIMD_WITH_ALTIVEC test_load_impl(i64_vec, "load int64_t"); test_load_impl(ui64_vec, "load uint64_t"); +#endif #ifdef XSIMD_32_BIT_ABI test_load_impl(l_vec, "load long"); test_load_impl(ul_vec, "load unsigned long"); #endif test_load_impl(f_vec, "load float"); -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 test_load_impl(d_vec, "load double"); #endif } @@ -104,14 +118,16 @@ struct load_store_test test_store_impl(ui16_vec, "load uint16_t"); test_store_impl(i32_vec, "load int32_t"); test_store_impl(ui32_vec, "load uint32_t"); +#ifndef XSIMD_WITH_ALTIVEC test_store_impl(i64_vec, "load int64_t"); test_store_impl(ui64_vec, "load uint64_t"); +#endif #ifdef XSIMD_32_BIT_ABI test_store_impl(l_vec, "load long"); test_store_impl(ul_vec, "load unsigned long"); #endif 
test_store_impl(f_vec, "load float"); -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 test_store_impl(d_vec, "load double"); #endif } @@ -123,15 +139,17 @@ struct load_store_test test_gather_impl(ui16_vec, "gather uint16_t"); test_gather_impl(i32_vec, "gather int32_t"); test_gather_impl(ui32_vec, "gather uint32_t"); +#ifndef XSIMD_WITH_ALTIVEC test_gather_impl(i64_vec, "gather int64_t"); test_gather_impl(ui64_vec, "gather uint64_t"); +#endif #ifdef XSIMD_32_BIT_ABI test_gather_impl(l_vec, "gather long"); test_gather_impl(ul_vec, "gather unsigned long"); #endif test_gather_impl(f_vec, "gather float"); test_gather_impl(f_vec, "gather float"); -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 test_gather_impl(d_vec, "gather double"); #endif } @@ -144,14 +162,16 @@ struct load_store_test test_scatter_impl(ui16_vec, "scatter uint16_t"); test_scatter_impl(i32_vec, "scatter int32_t"); test_scatter_impl(ui32_vec, "scatter uint32_t"); +#ifndef XSIMD_WITH_ALTIVEC test_scatter_impl(i64_vec, "scatter int64_t"); test_scatter_impl(ui64_vec, "scatter uint64_t"); +#endif #ifdef XSIMD_32_BIT_ABI test_scatter_impl(l_vec, "scatter long"); test_scatter_impl(ul_vec, "scatter unsigned long"); #endif test_scatter_impl(f_vec, "scatter float"); -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 test_scatter_impl(d_vec, "scatter double"); #endif } From 209d0c5a0c5cf56e80283c822077f6c4f8c316e9 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 12:39:08 +0200 Subject: [PATCH 48/57] WIP --- test/CMakeLists.txt | 8 ++++---- test/test_batch_cast.cpp | 6 +++--- test/test_bitwise_cast.cpp | 20 ++++++++++---------- test/test_explicit_batch_instantiation.cpp | 4 +++- test/test_load_store.cpp | 14 +++++++------- 5 files changed, 27 insertions(+), 25 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 883d2b39f..3e2840076 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -147,10 +147,10 @@ set(XSIMD_TESTS test_complex_trigonometric.cpp # test_conversion.cpp # test_custom_default_arch.cpp - # test_error_gamma.cpp - # test_explicit_batch_instantiation.cpp - # test_exponential.cpp - # test_extract_pair.cpp + test_error_gamma.cpp + test_explicit_batch_instantiation.cpp + test_exponential.cpp + test_extract_pair.cpp # test_fp_manipulation.cpp # test_hyperbolic.cpp test_load_store.cpp diff --git a/test/test_batch_cast.cpp b/test/test_batch_cast.cpp index 5d84176d3..e2e43a2ac 100644 --- a/test/test_batch_cast.cpp +++ b/test/test_batch_cast.cpp @@ -71,7 +71,7 @@ struct batch_cast_test using int32_batch = xsimd::batch; using uint32_batch = xsimd::batch; using float_batch = xsimd::batch; -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC using int64_batch = xsimd::batch; using uint64_batch = xsimd::batch; using double_batch = xsimd::batch; @@ -185,7 +185,7 @@ struct batch_cast_test test_cast_impl(test_value, "batch cast uint32 -> uint32"); test_cast_impl(test_value, "batch cast uint32 -> float"); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC test_cast_impl(test_value, "batch cast int64 -> int64"); test_cast_impl(test_value, "batch cast int64 -> uint64"); test_cast_impl(test_value, "batch cast int64 -> double"); @@ -202,7 +202,7 @@ struct batch_cast_test test_cast_impl(test_value, "batch cast float -> float"); } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC for (const auto& test_value : 
double_test_values) { test_cast_impl(test_value, "batch cast double -> int64"); diff --git a/test/test_bitwise_cast.cpp b/test/test_bitwise_cast.cpp index 8efedeadf..ac9b5f050 100644 --- a/test/test_bitwise_cast.cpp +++ b/test/test_bitwise_cast.cpp @@ -29,7 +29,7 @@ struct bitwise_cast_test int32_vector ftoi32_res; float_vector i32tof_res; -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC using int64_batch = xsimd::batch; using double_batch = xsimd::batch; @@ -49,7 +49,7 @@ struct bitwise_cast_test bitwise_cast_test() : ftoi32_res(2 * N) , i32tof_res(2 * N) -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC , dtoi32_res(2 * N) , i64tof_res(2 * N) , dtof_res(2 * N) @@ -66,7 +66,7 @@ struct bitwise_cast_test b.i32[0] = input.get(0); b.i32[1] = input.get(1); std::fill(i32tof_res.begin(), i32tof_res.end(), b.f[0]); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC std::fill(i32tod_res.begin(), i32tod_res.end(), b.d); #endif } @@ -76,12 +76,12 @@ struct bitwise_cast_test b.f[0] = input.get(0); b.f[1] = input.get(1); std::fill(ftoi32_res.begin(), ftoi32_res.end(), b.i32[0]); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC std::fill(ftoi64_res.begin(), ftoi64_res.end(), b.i64); std::fill(ftod_res.begin(), ftod_res.end(), b.d); #endif } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC { int64_batch input = i64_input(); bitcast b; @@ -119,7 +119,7 @@ struct bitwise_cast_test INFO("to_int32(float)"); CHECK_VECTOR_EQ(i32vres, ftoi32_res); } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC { int32_batch i32bres = xsimd::bitwise_cast(d_input()); i32bres.store_aligned(i32vres.data()); @@ -138,7 +138,7 @@ struct bitwise_cast_test INFO("to_float(int32_t)"); CHECK_VECTOR_EQ(fvres, i32tof_res); } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC { float_batch fbres = xsimd::bitwise_cast(i64_input()); fbres.store_aligned(fvres.data()); @@ -154,7 +154,7 @@ struct bitwise_cast_test #endif } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC void test_to_int64() { @@ -208,7 +208,7 @@ struct bitwise_cast_test return float_batch(3.); } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC int64_batch i64_input() const { return int64_batch(2); @@ -236,7 +236,7 @@ TEST_CASE_TEMPLATE("[bitwise cast]", B, CONVERSION_TYPES) SUBCASE("to_float") { Test.test_to_float(); } -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC SUBCASE("to_int64") { Test.test_to_int64(); } SUBCASE("to_double") { Test.test_to_double(); } diff --git a/test/test_explicit_batch_instantiation.cpp b/test/test_explicit_batch_instantiation.cpp index f988a6e06..290adc63e 100644 --- a/test/test_explicit_batch_instantiation.cpp +++ b/test/test_explicit_batch_instantiation.cpp @@ -22,10 +22,12 @@ namespace xsimd template class batch; template class batch; template class batch; +#if !XSIMD_WITH_ALTIVEC template class batch; template class batch; +#endif template class batch; -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 template class batch; #endif } diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index 8da992b7d..0d149c7fa 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -32,7 +32,7 @@ struct load_store_test using uint16_vector_type = std::vector>; using int32_vector_type = std::vector>; using uint32_vector_type = std::vector>; -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC using int64_vector_type = std::vector>; using uint64_vector_type = std::vector>; #endif @@ -51,7 +51,7 @@ struct load_store_test uint16_vector_type 
ui16_vec; int32_vector_type i32_vec; uint32_vector_type ui32_vec; -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC int64_vector_type i64_vec; uint64_vector_type ui64_vec; #endif @@ -74,7 +74,7 @@ struct load_store_test init_test_vector(ui16_vec); init_test_vector(i32_vec); init_test_vector(ui32_vec); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC init_test_vector(i64_vec); init_test_vector(ui64_vec); #endif @@ -96,7 +96,7 @@ struct load_store_test test_load_impl(ui16_vec, "load uint16_t"); test_load_impl(i32_vec, "load int32_t"); test_load_impl(ui32_vec, "load uint32_t"); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC test_load_impl(i64_vec, "load int64_t"); test_load_impl(ui64_vec, "load uint64_t"); #endif @@ -118,7 +118,7 @@ struct load_store_test test_store_impl(ui16_vec, "load uint16_t"); test_store_impl(i32_vec, "load int32_t"); test_store_impl(ui32_vec, "load uint32_t"); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC test_store_impl(i64_vec, "load int64_t"); test_store_impl(ui64_vec, "load uint64_t"); #endif @@ -139,7 +139,7 @@ struct load_store_test test_gather_impl(ui16_vec, "gather uint16_t"); test_gather_impl(i32_vec, "gather int32_t"); test_gather_impl(ui32_vec, "gather uint32_t"); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC test_gather_impl(i64_vec, "gather int64_t"); test_gather_impl(ui64_vec, "gather uint64_t"); #endif @@ -162,7 +162,7 @@ struct load_store_test test_scatter_impl(ui16_vec, "scatter uint16_t"); test_scatter_impl(i32_vec, "scatter int32_t"); test_scatter_impl(ui32_vec, "scatter uint32_t"); -#ifndef XSIMD_WITH_ALTIVEC +#if !XSIMD_WITH_ALTIVEC test_scatter_impl(i64_vec, "scatter int64_t"); test_scatter_impl(ui64_vec, "scatter uint64_t"); #endif From 6bb983045d3543905b704f98d15670101276197e Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Fri, 18 Jul 2025 13:28:34 +0200 Subject: [PATCH 49/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 8 ++++---- test/CMakeLists.txt | 6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 7cbb861c9..a6680e2e8 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -444,15 +444,15 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept { - return vec_ld(0, mem); + return vec_ld(0, reinterpret_cast::register_type*>(mem)); } // load_unaligned template ::value, void>::type> XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept { - auto lo = vec_ld(0, mem); - auto hi = vec_ld(16, mem); + auto lo = vec_ld(0, reinterpret_cast::register_type*>(mem)); + auto hi = vec_ld(16, reinterpret_cast::register_type*>(mem)); return vec_perm(lo, hi, vec_lvsl(0, mem)); } @@ -794,7 +794,7 @@ namespace xsimd template ::value, void>::type> XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept { - return vec_st(self.data, 0, mem); + return vec_st(self.data, 0, reinterpret_cast::register_type*>(mem)); } // store_unaligned diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 3e2840076..bd6b98ef8 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -146,15 +146,15 @@ set(XSIMD_TESTS test_complex_power.cpp test_complex_trigonometric.cpp # test_conversion.cpp - # test_custom_default_arch.cpp + test_custom_default_arch.cpp test_error_gamma.cpp test_explicit_batch_instantiation.cpp test_exponential.cpp test_extract_pair.cpp # 
test_fp_manipulation.cpp - # test_hyperbolic.cpp + test_hyperbolic.cpp test_load_store.cpp - # test_memory.cpp + test_memory.cpp # test_poly_evaluation.cpp # test_power.cpp # test_rounding.cpp From 86b3615e40468b07ebd05263d2a98c4283af1689 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sat, 19 Jul 2025 22:04:42 +0200 Subject: [PATCH 50/57] WIP --- test/CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index bd6b98ef8..c2db5fa0f 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -155,8 +155,8 @@ set(XSIMD_TESTS test_hyperbolic.cpp test_load_store.cpp test_memory.cpp - # test_poly_evaluation.cpp - # test_power.cpp + test_poly_evaluation.cpp + test_power.cpp # test_rounding.cpp # test_select.cpp # test_shuffle.cpp From d65c8edb13a4f0a503c073c4e9238c31510054e3 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Sat, 19 Jul 2025 23:02:05 +0200 Subject: [PATCH 51/57] WIP --- test/CMakeLists.txt | 6 +++--- test/test_conversion.cpp | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index c2db5fa0f..9e91c2047 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -145,19 +145,19 @@ set(XSIMD_TESTS test_complex_hyperbolic.cpp test_complex_power.cpp test_complex_trigonometric.cpp - # test_conversion.cpp + test_conversion.cpp test_custom_default_arch.cpp test_error_gamma.cpp test_explicit_batch_instantiation.cpp test_exponential.cpp test_extract_pair.cpp - # test_fp_manipulation.cpp + test_fp_manipulation.cpp test_hyperbolic.cpp test_load_store.cpp test_memory.cpp test_poly_evaluation.cpp test_power.cpp - # test_rounding.cpp + test_rounding.cpp # test_select.cpp # test_shuffle.cpp test_sum.cpp diff --git a/test/test_conversion.cpp b/test/test_conversion.cpp index 153920ac0..47950e80e 100644 --- a/test/test_conversion.cpp +++ b/test/test_conversion.cpp @@ -14,7 +14,7 @@ #include "test_utils.hpp" -#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 +#if (!XSIMD_WITH_NEON && !XSIMD_WITH_ALTIVEC) || XSIMD_WITH_NEON64 template struct conversion_test { From 0cb875c89350d4f8c9f58c48b412d15400c0e264 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 21 Jul 2025 08:04:47 +0200 Subject: [PATCH 52/57] WIP --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 9e91c2047..6e6b65556 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -140,7 +140,7 @@ set(XSIMD_TESTS test_batch_int.cpp test_bitwise_cast.cpp test_batch_constant.cpp - # test_batch_manip.cpp + test_batch_manip.cpp test_complex_exponential.cpp test_complex_hyperbolic.cpp test_complex_power.cpp From 88b3e0b700ca54083fadbe5e01baadf7f8ff6c8e Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 21 Jul 2025 08:25:23 +0200 Subject: [PATCH 53/57] WIP --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 6e6b65556..6302eb07c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -158,7 +158,7 @@ set(XSIMD_TESTS test_poly_evaluation.cpp test_power.cpp test_rounding.cpp - # test_select.cpp + test_select.cpp # test_shuffle.cpp test_sum.cpp test_traits.cpp From 7e6e837cea58aef18e6e690ee68515db9cf356c9 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 21 Jul 2025 08:43:08 +0200 Subject: [PATCH 54/57] WIP --- test/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/test/CMakeLists.txt b/test/CMakeLists.txt index 6302eb07c..23c9345bb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -159,7 +159,7 @@ set(XSIMD_TESTS test_power.cpp test_rounding.cpp test_select.cpp - # test_shuffle.cpp + test_shuffle.cpp test_sum.cpp test_traits.cpp test_trigonometric.cpp From a7f64dc92a58cf99517d7b72a19246fc23eafc7c Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 21 Jul 2025 09:07:42 +0200 Subject: [PATCH 55/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 2 +- test/CMakeLists.txt | 2 +- test/test_shuffle.cpp | 22 +++++++++++++++++++--- 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index a6680e2e8..5e441f90e 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -932,7 +932,7 @@ namespace xsimd #endif // zip_hi - template ::value, void>::type> + template ::value, void>::type> XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept { return vec_mergeh(self.data, other.data); diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 23c9345bb..e6bad7999 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -163,7 +163,7 @@ set(XSIMD_TESTS test_sum.cpp test_traits.cpp test_trigonometric.cpp - # test_xsimd_api.cpp + test_xsimd_api.cpp test_utils.hpp ) diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp index bc89aafd5..a87428d00 100644 --- a/test/test_shuffle.cpp +++ b/test/test_shuffle.cpp @@ -94,6 +94,8 @@ struct zip_test : zip_base #if !XSIMD_WITH_AVX512F || XSIMD_WITH_AVX512BW #define ZIP_BATCH_TYPES BATCH_TYPES +#elif XSIMD_WITH_ALTIVEC +#define ZIP_BATCH_TYPES xsimd::batch, xsimd::batch #else #define ZIP_BATCH_TYPES xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch #endif @@ -347,7 +349,13 @@ struct compress_test } }; -TEST_CASE_TEMPLATE("[compress]", B, BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch) +#if XSIMD_WITH_ALTIVEC +#define XSIMD_COMPRESS_TYPES BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch +#else +#define XSIMD_COMPRESS_TYPES BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch +#endif + +TEST_CASE_TEMPLATE("[compress]", B, XSIMD_COMPRESS_TYPES) { compress_test Test; SUBCASE("empty") @@ -443,7 +451,9 @@ struct expand_test } }; -TEST_CASE_TEMPLATE("[expand]", B, BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch) +#define XSIMD_EXPAND_TYPES XSIMD_COMPRESS_TYPES + +TEST_CASE_TEMPLATE("[expand]", B, XSIMD_EXPAND_TYPES) { expand_test Test; SUBCASE("empty") @@ -690,7 +700,13 @@ struct shuffle_test } }; -TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch) +#if XSIMD_WITH_ALTIVEC +#define XSIMD_SHUFFLE_TYPES BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch +#else +#define XSIMD_SHUFFLE_TYPES BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch +#endif + +TEST_CASE_TEMPLATE("[shuffle]", B, XSIMD_SHUFFLE_TYPES) { shuffle_test Test; SUBCASE("no-op") From 6b975ac3ef6e857fbfd0f1af039af2cd264b3a72 Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Mon, 21 Jul 2025 11:45:12 +0200 Subject: [PATCH 56/57] WIP --- include/xsimd/arch/xsimd_altivec.hpp | 4 ++-- 
include/xsimd/types/xsimd_altivec_register.hpp | 2 ++ 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/xsimd/arch/xsimd_altivec.hpp b/include/xsimd/arch/xsimd_altivec.hpp index 5e441f90e..41d845cac 100644 --- a/include/xsimd/arch/xsimd_altivec.hpp +++ b/include/xsimd/arch/xsimd_altivec.hpp @@ -750,14 +750,14 @@ namespace xsimd template XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept { - return vec_sll(x.data, vec_splat_u8(N)); + return vec_sll(x.data, vec_splats((uint32_t)N)); } // slide_right template XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept { - return vec_srl(x.data, vec_splat_u8(N)); + return vec_srl(x.data, vec_splats((uint32_t)N)); } // sadd diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp index 2ed7d89ee..36f117122 100644 --- a/include/xsimd/types/xsimd_altivec_register.hpp +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -64,6 +64,8 @@ namespace xsimd XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(short, short); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned int, int); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(int, int); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned long, long); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(long, long); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(float, int); #undef XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER From 51ef120d881ac24608263508dc825f2d38156f5e Mon Sep 17 00:00:00 2001 From: serge-sans-paille Date: Tue, 22 Jul 2025 12:21:40 +0200 Subject: [PATCH 57/57] WIP --- include/xsimd/types/xsimd_altivec_register.hpp | 1 + 1 file changed, 1 insertion(+) diff --git a/include/xsimd/types/xsimd_altivec_register.hpp b/include/xsimd/types/xsimd_altivec_register.hpp index 36f117122..4de69ea3c 100644 --- a/include/xsimd/types/xsimd_altivec_register.hpp +++ b/include/xsimd/types/xsimd_altivec_register.hpp @@ -67,6 +67,7 @@ namespace xsimd XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(unsigned long, long); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(long, long); XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(float, int); + XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER(double, long); #undef XSIMD_DECLARE_SIMD_BOOL_ALTIVEC_REGISTER }