diff --git a/.github/toolchains/gcc-powerpc64-linux-gnu.cmake b/.github/toolchains/gcc-powerpc64-linux-gnu.cmake
new file mode 100644
index 000000000..5dd97d6c6
--- /dev/null
+++ b/.github/toolchains/gcc-powerpc64-linux-gnu.cmake
@@ -0,0 +1,5 @@
+set(CMAKE_SYSTEM_PROCESSOR powerpc64)
+set(triple powerpc64-linux-gnu)
+
+include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake)
+
diff --git a/.github/toolchains/gcc-powerpc64le-linux-gnu.cmake b/.github/toolchains/gcc-powerpc64le-linux-gnu.cmake
new file mode 100644
index 000000000..eca1a2837
--- /dev/null
+++ b/.github/toolchains/gcc-powerpc64le-linux-gnu.cmake
@@ -0,0 +1,5 @@
+set(CMAKE_SYSTEM_PROCESSOR powerpc64le)
+set(triple powerpc64le-linux-gnu)
+
+include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake)
+
diff --git a/.github/workflows/cross.yml b/.github/workflows/cross-arm.yml
similarity index 100%
rename from .github/workflows/cross.yml
rename to .github/workflows/cross-arm.yml
diff --git a/.github/workflows/cross-ppc.yml b/.github/workflows/cross-ppc.yml
new file mode 100644
index 000000000..92ffae333
--- /dev/null
+++ b/.github/workflows/cross-ppc.yml
@@ -0,0 +1,44 @@
+name: PowerPC cross-compilation build
+on: [push, pull_request]
+concurrency:
+  group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }}
+  cancel-in-progress: true
+jobs:
+  build:
+    runs-on: ubuntu-latest
+    name: '${{ matrix.target.platform }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}'
+    strategy:
+      matrix:
+        target:
+          - { platform: 'ppc64le', dir: 'powerpc64le-linux-gnu', flags: '-maltivec -mvsx -mcpu=power10', full: 'OFF' }
+          - { platform: 'ppc64', dir: 'powerpc64-linux-gnu', flags: '-maltivec -mvsx -mcpu=power10', full: 'OFF' }
+        sys:
+          - { compiler: 'gcc', version: '12' }
+    steps:
+      - name: Setup compiler
+        if: ${{ matrix.sys.compiler == 'gcc' }}
+        run: |
+          sudo apt-get update || exit 1
+          sudo apt-get --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib || exit 1
+          sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true
+          sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true
+          sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20
+          sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20
+      - name: Setup QEMU
+        run: |
+          sudo apt-get --no-install-suggests --no-install-recommends install qemu-user
+      - name: Setup Ninja
+        run: |
+          sudo apt-get install ninja-build
+      - name: Checkout xsimd
+        uses: actions/checkout@v3
+      - name: Setup
+        run: |
+          mkdir _build
+          cd _build && cmake .. -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake
+      - name: Build
+        run: cmake --build _build --verbose -j1
+      - name: Testing xsimd
+        run: |
+          qemu-${{ matrix.target.platform }} -cpu power10 -L /usr/${{ matrix.target.dir }}/ ./test/test_xsimd
+        working-directory: ${{ github.workspace }}/_build
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 860a84bad..6dffce659 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,6 +49,7 @@ ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse3.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_1.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sse4_2.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_ssse3.hpp
+${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_vsx.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/arch/xsimd_sve.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_arch.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/config/xsimd_config.hpp
@@ -70,6 +71,7 @@ ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma3_sse_register.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_fma4_register.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_common_arch.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_register.hpp
+${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_vsx_register.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_rvv_register.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse2_register.hpp
 ${XSIMD_INCLUDE_DIR}/xsimd/types/xsimd_sse3_register.hpp
diff --git a/README.md b/README.md
index 87082f488..9313453a3 100644
--- a/README.md
+++ b/README.md
@@ -54,6 +54,7 @@ x86 | AVX512BW, AVX512CD, AVX512DQ, AVX512F (gcc7 and higher)
 x86 AMD | FMA4
 ARM | NEON, NEON64, SVE128/256/512 (fixed vector size)
 WebAssembly | WASM
+powerpc64 | VSX
 RISC-V | RISC-V128/256/512 (fixed vector size)
 
 ## Installation
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 390baf223..3d0137efe 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -9,6 +9,7 @@ INPUT = ../include/xsimd/types/xsimd_api.hpp \
         ../include/xsimd/memory/xsimd_aligned_allocator.hpp \
         ../include/xsimd/types/xsimd_common_arch.hpp \
         ../include/xsimd/types/xsimd_traits.hpp \
+        ../include/xsimd/types/xsimd_vsx_register.hpp \
         ../include/xsimd/types/xsimd_avx2_register.hpp \
         ../include/xsimd/types/xsimd_avx512bw_register.hpp \
         ../include/xsimd/types/xsimd_avx512cd_register.hpp \
diff --git a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
index ebef263ee..2e83d33de 100644
--- a/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
+++ b/include/xsimd/arch/common/xsimd_common_arithmetic.hpp
@@ -203,10 +203,9 @@ namespace xsimd
         {
             if (std::is_signed<T>::value)
             {
-                auto mask = (other >> (8 * sizeof(T) - 1));
                 auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
                 auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
-                return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
+                return other + select(other >= 0, self_pos_branch, self_neg_branch);
             }
             else
             {
diff --git a/include/xsimd/arch/common/xsimd_common_math.hpp b/include/xsimd/arch/common/xsimd_common_math.hpp
index 689029aae..cdd35c5dd 100644
--- a/include/xsimd/arch/common/xsimd_common_math.hpp
+++ b/include/xsimd/arch/common/xsimd_common_math.hpp
@@ -1087,7 +1087,7 @@ namespace xsimd
         template <class T, class A>
         XSIMD_INLINE batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<common>)
noexcept { - return batch(self.data) & batch(1); + return batch((typename batch::register_type)self.data) & batch(1); } // horner diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index 398d22511..1772159a0 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -132,6 +132,10 @@ #include "./xsimd_wasm.hpp" #endif +#if XSIMD_WITH_VSX +#include "./xsimd_vsx.hpp" +#endif + // Must come last to have access to all conversion specializations. #include "./xsimd_common.hpp" diff --git a/include/xsimd/arch/xsimd_vsx.hpp b/include/xsimd/arch/xsimd_vsx.hpp new file mode 100644 index 000000000..48b56f749 --- /dev/null +++ b/include/xsimd/arch/xsimd_vsx.hpp @@ -0,0 +1,797 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_VSX_HPP +#define XSIMD_VSX_HPP + +#include +#include +#include + +#include "../types/xsimd_vsx_register.hpp" + +#include + +namespace xsimd +{ + template + struct batch_bool_constant; + + template + XSIMD_INLINE batch bitwise_cast(batch const& x) noexcept; + + template + struct batch_constant; + + namespace kernel + { + template + XSIMD_INLINE batch avg(batch const&, batch const&, requires_arch) noexcept; + template + XSIMD_INLINE batch avgr(batch const&, batch const&, requires_arch) noexcept; + + // abs + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + return vec_abs(self.data); + } + + template + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + return vec_abs(self.data); + } + + // add + template ::value, void>::type> + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_add(self.data, other.data); + } + + // all + template ::value, void>::type> + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return vec_all_ne(self.data, vec_xor(self.data, self.data)); + } + + // any + template ::value, void>::type> + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + return vec_any_ne(self.data, vec_xor(self.data, self.data)); + } + + // avgr + template ::value && sizeof(T) < 8, void>::type> + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_avg(self.data, other.data); + } + template + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + return avgr(self, other, common {}); + } + template + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + return avgr(self, other, common {}); + } + + // avg + template ::value, void>::type> + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) < 8) + { + constexpr auto nbit = 8 * sizeof(T) - 1; + auto adj = bitwise_cast(bitwise_cast>((self ^ other) << nbit) >> nbit); + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, common {}); + } + } + template + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + return avg(self, other, common {}); + } + 
template + XSIMD_INLINE batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + return avg(self, other, common {}); + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return (typename batch_bool::register_type)self.data; + } + + // bitwise_and + template ::value, void>::type> + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_and(self.data, other.data); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return vec_and(self.data, other.data); + } + + // bitwise_andnot + template ::value, void>::type> + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_and(self.data, vec_nor(other.data, other.data)); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data & ~other.data; + } + + // bitwise_lshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + using shift_type = as_unsigned_integer_t; + batch shift(static_cast(other)); + return vec_sl(self.data, shift.data); + } + + // bitwise_not + template ::value, void>::type> + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + return vec_nor(self.data, self.data); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return vec_nor(self.data, self.data); + } + + // bitwise_or + template ::value, void>::type> + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_or(self.data, other.data); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return vec_or(self.data, other.data); + } + + // bitwise_rshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + using shift_type = as_unsigned_integer_t; + batch shift(static_cast(other)); + XSIMD_IF_CONSTEXPR(std::is_signed::value) + { + return vec_sra(self.data, shift.data); + } + else + { + return vec_sr(self.data, shift.data); + } + } + + // bitwise_xor + template ::value, void>::type> + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_xor(self.data, other.data); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return vec_xor(self.data, other.data); + } + + // bitwise_cast + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return (typename batch::register_type)(self.data); + } + + // broadcast + template ::value, void>::type> + XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + return vec_splats(val); + } + + // store_complex + namespace detail + { + // complex_low + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + return vec_mergeh(self.real().data, self.imag().data); + } + template + XSIMD_INLINE batch complex_low(batch, A> const& self, 
requires_arch) noexcept + { + return vec_mergeh(self.real().data, self.imag().data); + } + // complex_high + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return vec_mergel(self.real().data, self.imag().data); + } + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + return vec_mergel(self.real().data, self.imag().data); + } + } + + // decr_if + template ::value, void>::type> + XSIMD_INLINE batch decr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return self + batch((typename batch::register_type)mask.data); + } + + // div + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_div(self.data, other.data); + } + template + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_div(self.data, other.data); + } + + // fast_cast + namespace detail + { + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return vec_ctf(self.data, 0); + } + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return vec_ctf(self.data, 0); + } + + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return vec_cts(self.data, 0); + } + + template + XSIMD_INLINE batch fast_cast(batch const& self, batch const&, requires_arch) noexcept + { + return vec_ctu(self.data, 0); + } + } + + // eq + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + auto res = vec_cmpeq(self.data, other.data); + return *reinterpret_cast::register_type*>(&res); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + auto res = vec_cmpeq(self.data, other.data); + return *reinterpret_cast::register_type*>(&res); + } + + // first + template ::value, void>::type> + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + { + return vec_extract(self.data, 0); + } + + // ge + template ::value, void>::type> + XSIMD_INLINE batch_bool ge(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_cmpge(self.data, other.data); + } + + // gt + template ::value, void>::type> + XSIMD_INLINE batch_bool gt(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_cmpgt(self.data, other.data); + } + + // haddp + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + auto tmp0 = vec_mergee(row[0].data, row[1].data); // v00 v10 v02 v12 + auto tmp1 = vec_mergeo(row[0].data, row[1].data); // v01 v11 v03 v13 + auto tmp4 = vec_add(tmp0, tmp1); // (v00 + v01, v10 + v11, v02 + v03, v12 + v13) + + auto tmp2 = vec_mergee(row[2].data, row[3].data); // v20 v30 v22 v32 + auto tmp3 = vec_mergeo(row[2].data, row[3].data); // v21 v31 v23 v33 + auto tmp5 = vec_add(tmp2, tmp3); // (v20 + v21, v30 + v31, v22 + v23, v32 + v33) + + auto tmp6 = vec_perm(tmp4, tmp5, (__vector unsigned char) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }); // (v00 + v01, v10 + v11, v20 + v21, v30 + v31) + auto tmp7 = vec_perm(tmp4, tmp5, (__vector unsigned char) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 }); // (v02 + v03, v12 + v13, v22 + v23, v32 + v33) + + return vec_add(tmp6, tmp7); + } + + template + XSIMD_INLINE batch haddp(batch const* row,
requires_arch) noexcept + { + auto tmp0 = vec_mergee(row[0].data, row[1].data); // v00 v10 v02 v12 + auto tmp1 = vec_mergeo(row[0].data, row[1].data); // v01 v11 v03 v13 + return vec_add(tmp0, tmp1); + } + + // incr_if + template ::value, void>::type> + XSIMD_INLINE batch incr_if(batch const& self, batch_bool const& mask, requires_arch) noexcept + { + return self - batch((typename batch::register_type)mask.data); + } + + // insert + template ::value, void>::type> + XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept + { + return vec_insert(val, self.data, I); + } + + // isnan + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return ~vec_cmpeq(self.data, self.data); + } + template + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return ~vec_cmpeq(self.data, self.data); + } + + // load_aligned + template ::value, void>::type> + XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return vec_ld(0, reinterpret_cast::register_type*>(mem)); + } + + // load_unaligned + template ::value, void>::type> + XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return vec_vsx_ld(0, (typename batch::register_type const*)mem); + } + + // load_complex + namespace detail + { + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + __vector unsigned char perme = { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; + __vector unsigned char permo = { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; + return { vec_perm(hi.data, lo.data, perme), vec_perm(hi.data, lo.data, permo) }; + } + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + return { vec_mergee(hi.data, lo.data), vec_mergeo(hi.data, lo.data) }; + } + } + + // le + template ::value, void>::type> + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_cmple(self.data, other.data); + } + + // lt + template ::value, void>::type> + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_cmplt(self.data, other.data); + } + + // max + template ::value, void>::type> + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_max(self.data, other.data); + } + + // min + template ::value, void>::type> + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_min(self.data, other.data); + } + + // mul + template ::value, void>::type> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data * other.data; + // return vec_mul(self.data, other.data); + } + + // neg + template ::value, void>::type> + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return -(self.data); + } + + // neq + template ::value, void>::type> + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~vec_cmpeq(self.data, other.data); + } + template ::value, void>::type> + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return ~vec_cmpeq(self.data, other.data); + } + + // reciprocal + template + XSIMD_INLINE batch reciprocal(batch const& self, + kernel::requires_arch) + { + return vec_re(self.data); + } + template + 
XSIMD_INLINE batch reciprocal(batch const& self, + kernel::requires_arch) + { + return vec_re(self.data); + } + + // reduce_add + template + XSIMD_INLINE signed reduce_add(batch const& self, requires_arch) noexcept + { + auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 + auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 + auto tmp2 = vec_mergel(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0 + auto tmp3 = vec_add(tmp1, tmp2); + return vec_extract(tmp3, 0); + } + template + XSIMD_INLINE unsigned reduce_add(batch const& self, requires_arch) noexcept + { + auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 + auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 + auto tmp2 = vec_mergel(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0 + auto tmp3 = vec_add(tmp1, tmp2); + return vec_extract(tmp3, 0); + } + template + XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept + { + // FIXME: find an in-order approach + auto tmp0 = vec_reve(self.data); // v3, v2, v1, v0 + auto tmp1 = vec_add(self.data, tmp0); // v0 + v3, v1 + v2, v2 + v1, v3 + v0 + auto tmp2 = vec_mergel(tmp1, tmp1); // v2 + v1, v2 + v1, v3 + v0, v3 + v0 + auto tmp3 = vec_add(tmp1, tmp2); + return vec_extract(tmp3, 0); + } + template + XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept + { + auto tmp0 = vec_reve(self.data); // v1, v0 + auto tmp1 = vec_add(self.data, tmp0); // v0 + v1, v1 + v0 + return vec_extract(tmp1, 0); + } + template ::value, void>::type> + XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept + { + return hadd(self, common {}); + } + + // rsqrt + template + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return vec_rsqrt(val.data); + } + template + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return vec_rsqrt(val.data); + } + + // select + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return vec_sel(false_br.data, true_br.data, cond.data); + } + template ::value, void>::type> + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... 
}, true_br, false_br, vsx {}); + } + + // shuffle + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept + { + return vec_perm(x.data, y.data, + (__vector unsigned char) { + 4 * I0 + 0, 4 * I0 + 1, 4 * I0 + 2, 4 * I0 + 3, + 4 * I1 + 0, 4 * I1 + 1, 4 * I1 + 2, 4 * I1 + 3, + 4 * I2 + 0, 4 * I2 + 1, 4 * I2 + 2, 4 * I2 + 3, + 4 * I3 + 0, 4 * I3 + 1, 4 * I3 + 2, 4 * I3 + 3 }); + } + + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept + { + return vec_perm(x.data, y.data, + (__vector unsigned char) { + 8 * I0 + 0, + 8 * I0 + 1, + 8 * I0 + 2, + 8 * I0 + 3, + 8 * I0 + 4, + 8 * I0 + 5, + 8 * I0 + 6, + 8 * I0 + 7, + 8 * I1 + 0, + 8 * I1 + 1, + 8 * I1 + 2, + 8 * I1 + 3, + 8 * I1 + 4, + 8 * I1 + 5, + 8 * I1 + 6, + 8 * I1 + 7, + }); + } + + // sqrt + template + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return vec_sqrt(val.data); + } + + template + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return vec_sqrt(val.data); + } + + // slide_left + template + XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(N == batch::size * sizeof(T)) + { + return batch(0); + } + else + { + auto slider = vec_splats((uint8_t)(8 * N)); + return (typename batch::register_type)vec_slo(x.data, slider); + } + } + + // slide_right + template + XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(N == batch::size * sizeof(T)) + { + return batch(0); + } + else + { + auto slider = vec_splats((uint8_t)(8 * N)); + return (typename batch::register_type)vec_sro((__vector unsigned char)x.data, slider); + } + } + + // sadd + template ::value && sizeof(T) != 8, void>::type> + XSIMD_INLINE batch sadd(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_adds(self.data, other.data); + } + + // set + template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return typename batch::register_type { values... }; + } + + template ::value, void>::type> + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return typename batch_bool::register_type { static_cast::register_type>()[0])>(values ? -1LL : 0LL)... 
}; + } + + // ssub + + template ::value && sizeof(T) == 1, void>::type> + XSIMD_INLINE batch ssub(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_subs(self.data, other.data); + } + + // store_aligned + template ::value, void>::type> + XSIMD_INLINE void store_aligned(T* mem, batch const& self, requires_arch) noexcept + { + return vec_st(self.data, 0, reinterpret_cast::register_type*>(mem)); + } + + // store_unaligned + template ::value, void>::type> + XSIMD_INLINE void store_unaligned(T* mem, batch const& self, requires_arch) noexcept + { + return vec_vsx_st(self.data, 0, reinterpret_cast::register_type*>(mem)); + } + + // sub + template ::value, void>::type> + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_sub(self.data, other.data); + } + + // swizzle + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + return vec_perm(self.data, self.data, + (__vector unsigned char) { + 4 * V0 + 0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, + 4 * V1 + 0, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, + 4 * V2 + 0, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, + 4 * V3 + 0, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 }); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + return vec_perm(self.data, self.data, + (__vector unsigned char) { + 8 * V0 + 0, + 8 * V0 + 1, + 8 * V0 + 2, + 8 * V0 + 3, + 8 * V0 + 4, + 8 * V0 + 5, + 8 * V0 + 6, + 8 * V0 + 7, + 8 * V1 + 0, + 8 * V1 + 1, + 8 * V1 + 2, + 8 * V1 + 3, + 8 * V1 + 4, + 8 * V1 + 5, + 8 * V1 + 6, + 8 * V1 + 7, + }); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + return vec_perm(self.data, self.data, + (__vector unsigned char) { + 8 * V0 + 0, + 8 * V0 + 1, + 8 * V0 + 2, + 8 * V0 + 3, + 8 * V0 + 4, + 8 * V0 + 5, + 8 * V0 + 6, + 8 * V0 + 7, + 8 * V1 + 0, + 8 * V1 + 1, + 8 * V1 + 2, + 8 * V1 + 3, + 8 * V1 + 4, + 8 * V1 + 5, + 8 * V1 + 6, + 8 * V1 + 7, + }); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, vsx {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + return vec_perm(self.data, self.data, + (__vector unsigned char) { + 4 * V0 + 0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, + 4 * V1 + 0, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, + 4 * V2 + 0, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, + 4 * V3 + 0, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 }); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, vsx {})); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + return vec_perm(self.data, self.data, + (__vector unsigned char) { + 2 * V0 + 0, 2 * V0 + 1, 2 * V1 + 0, 2 * V1 + 1, + 2 * V2 + 0, 2 * V2 + 1, 2 * V3 + 0, 2 * V3 + 1, + 2 * V4 + 0, 2 * V4 + 1, 2 * V5 + 0, 2 * V5 + 1, + 2 * V6 + 0, 2 * V6 + 1, 2 * V7 + 0, 2 * V7 + 1 }); + } + + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant mask, requires_arch) noexcept + { + return bitwise_cast(swizzle(bitwise_cast(self), mask, vsx {})); + } + + // zip_hi + template ::value, void>::type> + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_mergel(self.data, other.data); + } + + // zip_lo 
+ template ::value, void>::type> + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_mergeh(self.data, other.data); + } + } +} + +#endif diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp index 89fc6783d..ddd78e13d 100644 --- a/include/xsimd/config/xsimd_arch.hpp +++ b/include/xsimd/config/xsimd_arch.hpp @@ -169,14 +169,16 @@ namespace xsimd using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; using all_rvv_architectures = arch_list, detail::rvv<256>, detail::rvv<128>>; using all_arm_architectures = typename detail::join, neon64, neon>>::type; + using all_power_architectures = arch_list; using all_riscv_architectures = all_rvv_architectures; using all_wasm_architectures = arch_list; - using all_architectures = typename detail::join::type; + using all_architectures = typename detail::join::type; using supported_architectures = typename detail::supported::type; using x86_arch = typename detail::supported::type::best; using arm_arch = typename detail::supported::type::best; + using power_arch = typename detail::supported::type::best; using riscv_arch = typename detail::supported::type::best; using best_arch = typename supported_architectures::best; diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index 326f766c4..70d065292 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ -408,6 +408,17 @@ #define XSIMD_WITH_WASM 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if VMX with VSX extension is available at compile-time, to 0 otherwise. + */ +#if defined(__VEC__) && defined(__VSX__) +#define XSIMD_WITH_VSX 1 +#else +#define XSIMD_WITH_VSX 0 +#endif + // Workaround for MSVC compiler #ifdef _MSC_VER @@ -466,7 +477,7 @@ #endif -#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_EMULATED +#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_VSX && !XSIMD_WITH_EMULATED #define XSIMD_NO_SUPPORTED_ARCHITECTURE #endif diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index 7b940f655..8fdae0e51 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -84,6 +84,7 @@ namespace xsimd ARCH_FIELD_EX_REUSE(detail::rvv<256>, rvv) ARCH_FIELD_EX_REUSE(detail::rvv<128>, rvv) ARCH_FIELD(wasm) + ARCH_FIELD(vsx) #undef ARCH_FIELD @@ -95,6 +96,10 @@ 
namespace xsimd wasm = 1; #endif +#if XSIMD_WITH_VSX + vsx = 1; +#endif + #if defined(__aarch64__) || defined(_M_ARM64) neon = 1; neon64 = 1; diff --git a/include/xsimd/config/xsimd_inline.hpp b/include/xsimd/config/xsimd_inline.hpp index a87410b41..eaf024103 100644 --- a/include/xsimd/config/xsimd_inline.hpp +++ b/include/xsimd/config/xsimd_inline.hpp @@ -12,7 +12,9 @@ #ifndef XSIMD_INLINE_HPP #define XSIMD_INLINE_HPP -#if defined __has_attribute +#if defined(__VEC__) +#define XSIMD_INLINE inline +#elif defined __has_attribute #if __has_attribute(always_inline) #define XSIMD_INLINE inline __attribute__((always_inline)) #endif diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp index a652061a8..33f9b465d 100644 --- a/include/xsimd/types/xsimd_all_registers.hpp +++ b/include/xsimd/types/xsimd_all_registers.hpp @@ -48,6 +48,8 @@ #include "xsimd_wasm_register.hpp" +#include "xsimd_vsx_register.hpp" + #if XSIMD_WITH_EMULATED #include "xsimd_emulated_register.hpp" #endif diff --git a/include/xsimd/types/xsimd_api.hpp b/include/xsimd/types/xsimd_api.hpp index 3b080dd95..eeee5b267 100644 --- a/include/xsimd/types/xsimd_api.hpp +++ b/include/xsimd/types/xsimd_api.hpp @@ -2206,6 +2206,9 @@ namespace xsimd * Slide the whole batch to the left by \c n bytes. This is different from * \c bitwise_lshift that shifts each batch element to the left. * + * @warning The behavior of this function is platform-dependent on big + * endian architectures. + * * @tparam N Amount of bytes to slide to the left. * @param x batch of integer values. * @return slided batch. @@ -2224,6 +2227,9 @@ namespace xsimd * Slide the whole batch to the right by \c N bytes. This is different from * \c bitwise_rshift that shifts each batch element to the right. * + * @warning The behavior of this function is platform-dependent on big + * endian architectures. + * * @tparam N Amount of bytes to slide to the right. * @param x batch of integer values. * @return slided batch. diff --git a/include/xsimd/types/xsimd_vsx_register.hpp b/include/xsimd/types/xsimd_vsx_register.hpp new file mode 100644 index 000000000..cfd450317 --- /dev/null +++ b/include/xsimd/types/xsimd_vsx_register.hpp @@ -0,0 +1,77 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_VSX_REGISTER_HPP +#define XSIMD_VSX_REGISTER_HPP + +#include "./xsimd_common_arch.hpp" +#include "./xsimd_register.hpp" + +#if XSIMD_WITH_VSX +#include +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * VSX instructions + */ + struct vsx : common + { + static constexpr bool supported() noexcept { return XSIMD_WITH_VSX; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr char const* name() noexcept { return "vmx+vsx"; } + }; + +#if XSIMD_WITH_VSX + namespace types + { + +#define XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(T, Tb) \ + template <> \ + struct get_bool_simd_register \ + { \ + struct type \ + { \ + using register_type = __vector __bool Tb; \ + register_type data; \ + type() = default; \ + type(register_type r) \ + : data(r) \ + { \ + } \ + operator register_type() const noexcept { return data; } \ + }; \ + }; \ + XSIMD_DECLARE_SIMD_REGISTER(T, vsx, __vector T) + + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(signed char, char); + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned char, char); + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(char, char); + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned short, short); + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(short, short); + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned int, int); + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(int, int); + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(unsigned long, long); + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(long, long); + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(float, int); + XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER(double, long); + +#undef XSIMD_DECLARE_SIMD_BOOL_VSX_REGISTER + } +#endif +} + +#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 8a4ce50d5..e6bad7999 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -107,6 +107,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mcpu=${TARGET_ARCH} -mtune=${TARGET_ARCH}") elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "riscv64") # Nothing specific + elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc") + # Nothing specific elseif(NOT WIN32 AND NOT EMSCRIPTEN) if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}") diff --git a/test/test_batch_int.cpp b/test/test_batch_int.cpp index 9a992a566..3b58e0548 100644 --- a/test/test_batch_int.cpp +++ b/test/test_batch_int.cpp @@ -285,21 +285,15 @@ struct batch_int_test for (int32_t i = 0; i < s; ++i) { res = lhs << i; - value_type expected = value_type(1) << i; - for (std::size_t j = 0; j < size; ++j) - { - CHECK_EQ(res.get(j), expected); - } + batch_type expected(value_type(1) << i); + CHECK_BATCH_EQ(res, expected); } lhs = batch_type(std::numeric_limits::max()); for (int32_t i = 0; i < s; ++i) { res = lhs >> i; - value_type expected = std::numeric_limits::max() >> i; - for (std::size_t j = 0; j < size; ++j) - { - CHECK_EQ(res.get(j), expected); - } + batch_type expected(std::numeric_limits::max() >> i); + CHECK_BATCH_EQ(res, expected); } } diff --git a/test/test_load_store.cpp b/test/test_load_store.cpp index 7a4d80932..449c41e85 100644 --- a/test/test_load_store.cpp +++ b/test/test_load_store.cpp @@ -39,7 +39,9 @@ struct load_store_test using ulong_vector_type = 
std::vector>; #endif using float_vector_type = std::vector>; +#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 using double_vector_type = std::vector>; +#endif int8_vector_type i8_vec; uint8_vector_type ui8_vec; @@ -54,7 +56,9 @@ struct load_store_test ulong_vector_type ul_vec; #endif float_vector_type f_vec; +#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 double_vector_type d_vec; +#endif array_type expected; @@ -73,7 +77,9 @@ struct load_store_test init_test_vector(ul_vec); #endif init_test_vector(f_vec); +#if !XSIMD_WITH_NEON || XSIMD_WITH_NEON64 init_test_vector(d_vec); +#endif } void test_load() diff --git a/test/test_memory.cpp b/test/test_memory.cpp index 930ef26fd..f582d59bd 100644 --- a/test/test_memory.cpp +++ b/test/test_memory.cpp @@ -46,7 +46,7 @@ TEST_CASE("[is_aligned]") float f[100]; void* unaligned_f = static_cast(&f[0]); constexpr std::size_t alignment = xsimd::default_arch::alignment(); - std::size_t aligned_f_size; + std::size_t aligned_f_size = sizeof(f); void* aligned_f = std::align(alignment, sizeof(f), unaligned_f, aligned_f_size); CHECK_UNARY(xsimd::is_aligned(aligned_f)); diff --git a/test/test_shuffle.cpp b/test/test_shuffle.cpp index 846da5b7d..b08210974 100644 --- a/test/test_shuffle.cpp +++ b/test/test_shuffle.cpp @@ -14,6 +14,13 @@ #include "test_utils.hpp" +#ifdef __linux__ +#include "endian.h" +#if BYTE_ORDER == BIG_ENDIAN +#define XSIMD_NO_SLIDE +#endif +#endif + #include namespace @@ -197,6 +204,7 @@ struct slide_test : public init_slide_base INFO("slide_left full"); CHECK_BATCH_EQ(b_res_left_full, b_left_full); +#ifndef XSIMD_NO_SLIDE B b_res_left_half = xsimd::slide_left(b_in); INFO("slide_left half_slide"); CHECK_BATCH_EQ(b_res_left_half, b_left_half); @@ -215,6 +223,7 @@ struct slide_test : public init_slide_base INFO("slide_left below_half_slide"); CHECK_BATCH_EQ(b_res_left_below_half, b_left_below_half); } +#endif } void slide_right() @@ -235,6 +244,7 @@ struct slide_test : public init_slide_base INFO("slide_right full"); CHECK_BATCH_EQ(b_res_right_full, b_right_full); +#ifndef XSIMD_NO_SLIDE B b_res_right_half = xsimd::slide_right(b_in); INFO("slide_right half_slide"); CHECK_BATCH_EQ(b_res_right_half, b_right_half); @@ -253,6 +263,7 @@ struct slide_test : public init_slide_base INFO("slide_right below_half_slide"); CHECK_BATCH_EQ(b_res_right_below_half, b_right_below_half); } +#endif } }; @@ -344,7 +355,9 @@ struct compress_test } }; -TEST_CASE_TEMPLATE("[compress]", B, BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch) +#define XSIMD_COMPRESS_TYPES BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch + +TEST_CASE_TEMPLATE("[compress]", B, XSIMD_COMPRESS_TYPES) { compress_test Test; SUBCASE("empty") @@ -440,7 +453,9 @@ struct expand_test } }; -TEST_CASE_TEMPLATE("[expand]", B, BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch) +#define XSIMD_EXPAND_TYPES XSIMD_COMPRESS_TYPES + +TEST_CASE_TEMPLATE("[expand]", B, XSIMD_EXPAND_TYPES) { expand_test Test; SUBCASE("empty") @@ -687,7 +702,9 @@ struct shuffle_test } }; -TEST_CASE_TEMPLATE("[shuffle]", B, BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch) +#define XSIMD_SHUFFLE_TYPES BATCH_FLOAT_TYPES, xsimd::batch, xsimd::batch, xsimd::batch, xsimd::batch + +TEST_CASE_TEMPLATE("[shuffle]", B, XSIMD_SHUFFLE_TYPES) { shuffle_test Test; SUBCASE("no-op")