diff --git a/.github/toolchains/gcc-s390x-linux-gnu.cmake b/.github/toolchains/gcc-s390x-linux-gnu.cmake new file mode 100644 index 000000000..05fba0b53 --- /dev/null +++ b/.github/toolchains/gcc-s390x-linux-gnu.cmake @@ -0,0 +1,4 @@ +set(CMAKE_SYSTEM_PROCESSOR s390x) +set(triple s390x-linux-gnu) + +include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) diff --git a/.github/workflows/cross-s390x.yml b/.github/workflows/cross-s390x.yml new file mode 100644 index 000000000..2f98aae77 --- /dev/null +++ b/.github/workflows/cross-s390x.yml @@ -0,0 +1,48 @@ +name: IBM Z cross-compilation build +on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + name: '${{ matrix.target.platform }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' + strategy: + matrix: + target: + - { platform: 's390x', dir: 's390x-linux-gnu', flags: '-mzvector -march=z14', full: 'OFF' } + sys: + - { compiler: 'gcc', version: '14' } + steps: + - name: Setup compiler + if: ${{ matrix.sys.compiler == 'gcc' }} + run: | + sudo apt-get update || exit 1 + sudo apt-get -y --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib cmake || exit 1 + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20 + - name: Setup QEMU + run: | + sudo apt-get -y --no-install-suggests --no-install-recommends install qemu-user + - name: Setup Ninja + run: | + sudo apt-get -y install ninja-build
+ - name: Checkout xsimd + uses: actions/checkout@v6 + - name: Setup + run: | + cmake -B _build \ + -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \ + -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" \ + -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" \ + -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake + - name: Build + run: cmake --build _build --verbose -j1 + - name: Testing xsimd + run: qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd + working-directory: ${{ github.workspace }}/_build diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index 1772159a0..8ab4261ea 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -136,6 +136,10 @@ #include "./xsimd_vsx.hpp" #endif +#if XSIMD_WITH_VXE +#include "./xsimd_vxe.hpp" +#endif + // Must come last to have access to all conversion specializations. #include "./xsimd_common.hpp" diff --git a/include/xsimd/arch/xsimd_vxe.hpp b/include/xsimd/arch/xsimd_vxe.hpp new file mode 100644 index 000000000..9eeb07f19 --- /dev/null +++ b/include/xsimd/arch/xsimd_vxe.hpp @@ -0,0 +1,797 @@ +/*************************************************************************** + * Copyright (c) Andreas Krebbel * + * Based on xsimd_vsx.hpp * + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_VXE_HPP +#define XSIMD_VXE_HPP + +#include +#include +#include + +#include "../types/xsimd_vxe_register.hpp" + +namespace xsimd +{ + namespace kernel + { + using namespace types; + using v1ti = __int128 __attribute__((vector_size(16))); + using v4sf = float __attribute__((vector_size(16))); + using v2df = double __attribute__((vector_size(16))); + using uv2di = unsigned long long int __attribute__((vector_size(16))); + using v2di = long long int __attribute__((vector_size(16))); + using uv4si = unsigned int __attribute__((vector_size(16))); + using v4si = int __attribute__((vector_size(16))); + using uv8hi = unsigned short int __attribute__((vector_size(16))); + using v8hi = short int __attribute__((vector_size(16))); + using uv16qi = unsigned char __attribute__((vector_size(16))); + using v16qi = signed char __attribute__((vector_size(16))); + + // builtin_t - the scalar type as it would be used for a vector intrinsic + // VXE vector intrinsics do not support long, unsigned long, and char + // The builtin definition can be used to map the incoming + // type to the right one to be used with the intrinsics. 
+ template + struct builtin_scalar + { + using type = T; + }; + + template <> + struct builtin_scalar + { + using type = unsigned long long; + }; + + template <> + struct builtin_scalar + { + using type = long long; + }; + +#ifdef __CHAR_UNSIGNED__ + template <> + struct builtin_scalar + { + using type = unsigned char; + }; +#else + template <> + struct builtin_scalar + { + using type = signed char; + }; +#endif + + template + using builtin_t = typename builtin_scalar::type; + + // bitwise_cast + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return (typename batch::register_type)(self.data); + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return (typename batch_bool::register_type)self.data; + } + + // load + + // load_unaligned + template ::value>> + XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return (typename batch::register_type)vec_xl(0, (builtin_t*)mem); + } + + // load_aligned + template ::value>> + XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return load_unaligned(mem, kernel::convert {}, vxe {}); + } + + // load_complex + namespace detail + { + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + // Interleave real and imaginary parts + // hi = [r0, i0, r1, i1], lo = [r2, i2, r3, i3] + // We need: real = [r0, r1, r2, r3], imag = [i0, i1, i2, i3] + using v4sf = float __attribute__((vector_size(16))); + uv16qi perm_real = (uv16qi) { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; + uv16qi perm_imag = (uv16qi) { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; + v4sf real = vec_perm((v4sf)hi.data, (v4sf)lo.data, perm_real); + v4sf imag = vec_perm((v4sf)hi.data, (v4sf)lo.data, perm_imag); + return { batch(real), batch(imag) }; + } + + 
template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + // hi = [r0, i0], lo = [r1, i1] + // We need: real = [r0, r1], imag = [i0, i1] + using v2df = double __attribute__((vector_size(16))); + uv16qi perm_real = (uv16qi) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }; + uv16qi perm_imag = (uv16qi) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 }; + v2df real = vec_perm((v2df)hi.data, (v2df)lo.data, perm_real); + v2df imag = vec_perm((v2df)hi.data, (v2df)lo.data, perm_imag); + return { batch(real), batch(imag) }; + } + + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; + return batch(vec_perm((v4sf)self.real().data, (v4sf)self.imag().data, perm)); + } + + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 }; + return batch(vec_perm((v4sf)self.real().data, (v4sf)self.imag().data, perm)); + } + + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }; + return batch(vec_perm((v2df)self.real().data, (v2df)self.imag().data, perm)); + } + + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 }; + return batch(vec_perm((v2df)self.real().data, (v2df)self.imag().data, perm)); + } + } + + // store + template + XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept + { + vec_xst(src.data, 0, (builtin_t*)dst); + } + + template + XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept + { + store_aligned(dst, src, vxe {}); + } + + // set + 
template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return typename batch::register_type { values... }; + } + + template ::value>> + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return typename batch_bool::register_type { static_cast::register_type>()[0])>(values ? -1LL : 0LL)... }; + } + // first + template ::value>> + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + { + return self.data[0]; + } + // insert + template ::value>> + XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept + { + // vec_insert on float is broken with clang + batch out(self); + out.data[I] = val; + return out; + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data == other.data; + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data == other.data; + } + + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data < other.data; + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data <= other.data; + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self.data == other.data); + } + + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return bitwise_xor(self, other); + } + + // sub + template + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data - other.data; + } + + // broadcast + template ::value>> + 
XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + return vec_splats(static_cast>(val)); + } + + // abs + template ::value, void>::type> + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + return vec_abs(self.data); + } + // bitwise_and + template + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch::register_type)((v4si)self.data & (v4si)other.data); + } + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data & other.data; + } + + // bitwise_or + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch::register_type)((v4si)self.data | (v4si)other.data); + } + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data | other.data; + } + + // bitwise_xor + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch::register_type)((v4si)self.data ^ (v4si)other.data); + } + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data ^ other.data; + } + + // bitwise_not + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + // ~ operator does not work on floating point vectors + return (typename batch::register_type)(~(v4si)self.data); + } + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return ~self.data; + } + + // bitwise_andnot + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch::register_type)((v4si)self.data & ~(v4si)other.data); + } + template + XSIMD_INLINE batch_bool 
bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data & ~other.data; + } + + // div + template ::value>> + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data / other.data; + } + + // neg + template + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return (typename batch::register_type) { 0 } - self.data; + } + + // add + template + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data + other.data; + } + + // all + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return ((v1ti)self.data)[0] == -1; + } + + // any + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + return ((v1ti)self.data)[0] != 0; + } + // avgr + template ::value>> + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_avg(self.data, other.data); + } + + // max + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_max(self.data, other.data); + } + // min + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_min(self.data, other.data); + } + // fma + template ::value>> + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return vec_madd(x.data, y.data, z.data); + } + // fms + template ::value>> + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return vec_msub(x.data, y.data, z.data); + } + + // mul + template ::value>> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data * other.data; + } + // haddp + template + XSIMD_INLINE batch haddp(batch const* r, requires_arch) noexcept + { + v4sf lo01, hi01, lo23, 
hi23, sum01, sum23, sumeven, sumodd; + lo01 = vec_mergel(r[0].data, r[1].data); // { r[0][2], r[1][2], r[0][3], r[1][3] } + hi01 = vec_mergeh(r[0].data, r[1].data); // { r[0][0], r[1][0], r[0][1], r[1][1] } + lo23 = vec_mergel(r[2].data, r[3].data); // { r[2][2], r[2][2], r[3][3], r[3][3] } + hi23 = vec_mergeh(r[2].data, r[3].data); // { r[2][0], r[2][0], r[3][1], r[3][1] } + sum01 = lo01 + hi01; // { r[0][0] + r[0][2], r[1][0] + r[1][2], r[0][1] + r[0][3], r[1][1] + r[1][3] } + sum23 = lo23 + hi23; // { r[2][0] + r[2][2], r[3][0] + r[3][2], r[2][1] + r[2][3], r[3][1] + r[3][3] } + sumeven = (v4sf)vec_mergeh((v2di)sum01, (v2di)sum23); // { r[0][0] + r[0][2], r[1][0] + r[1][2], r[2][0] + r[2][2], r[3][0] + r[3][2] } + sumodd = (v4sf)vec_mergel((v2di)sum01, (v2di)sum23); // { r[0][1] + r[0][3], r[1][1] + r[1][3], r[2][1] + r[2][3], r[3][1] + r[3][3] } + return sumeven + sumodd; + } + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + return vec_mergeh(row[0].data, row[1].data) + vec_mergel(row[0].data, row[1].data); + } + + // reduce_add + template + XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept + { + v4sf shifted_64 = vec_sld(self.data, self.data, 8); + v4sf added_1 = self.data + shifted_64; + v4sf shifted_32 = vec_sld(added_1, added_1, 4); + return (added_1 + shifted_32)[0]; + } + + template + XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept + { + return (self.data + vec_sld(self.data, self.data, 8))[0]; + } + + template + XSIMD_INLINE uint64_t reduce_add(batch const& self, requires_arch) noexcept + { + uv2di shifted = vec_sld((uv2di)self.data, (uv2di)self.data, 8); + uv2di sum = (uv2di)self.data + shifted; + return (uint64_t)sum[0]; + } + template + XSIMD_INLINE int64_t reduce_add(batch const& self, requires_arch) noexcept + { + v2di shifted = vec_sld((v2di)self.data, (v2di)self.data, 8); + v2di sum = (v2di)self.data + shifted; + return (int64_t)sum[0]; + } + template ::value, 
void>::type> + XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + using t = typename batch::register_type; + t shifted_64 = vec_sld(self.data, self.data, 8); + t added_1 = self.data + shifted_64; + t shifted_32 = vec_sld(added_1, added_1, 4); + return (added_1 + shifted_32)[0]; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + using t = typename batch::register_type; + t shifted_64 = vec_sld(self.data, self.data, 8); + t added_1 = self.data + shifted_64; + t shifted_32 = vec_sld(added_1, added_1, 4); + t added_2 = added_1 + shifted_32; + t shifted_16 = vec_sld(added_2, added_2, 2); + return (added_2 + shifted_16)[0]; + } + else + { + using t = typename batch::register_type; + t shifted_64 = vec_sld(self.data, self.data, 8); + t added_1 = self.data + shifted_64; + t shifted_32 = vec_sld(added_1, added_1, 4); + t added_2 = added_1 + shifted_32; + t shifted_16 = vec_sld(added_2, added_2, 2); + t added_3 = added_2 + shifted_16; + t shifted_8 = vec_sld(added_3, added_3, 1); + return (added_3 + shifted_8)[0]; + } + } + + // select + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return vec_sel(false_br.data, true_br.data, cond.data); + } + template ::value>> + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... 
}, true_br, false_br, vxe {}); + } + + // slide_left + template + XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(N == batch::size * sizeof(T)) + { + return batch(0); + } + else + { + auto shift_count = vec_splats((uint8_t)(8 * N)); + return vec_sll(x.data, shift_count); + } + } + + // slide_right + template + XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(N == batch::size * sizeof(T)) + { + return batch(0); + } + else + { + auto shift_count = vec_splats((uint8_t)(8 * N)); + return vec_srl(x.data, shift_count); + } + } + + // sqrt + template ::value>> + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return vec_sqrt(val.data); + } + + // rsqrt + template ::value>> + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return batch(T(1)) / sqrt(val, vxe {}); + } + + // shuffle + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept + { + return vec_perm(x.data, y.data, + (__vector unsigned char) { + 4 * I0 + 0, 4 * I0 + 1, 4 * I0 + 2, 4 * I0 + 3, + 4 * I1 + 0, 4 * I1 + 1, 4 * I1 + 2, 4 * I1 + 3, + 4 * I2 + 0, 4 * I2 + 1, 4 * I2 + 2, 4 * I2 + 3, + 4 * I3 + 0, 4 * I3 + 1, 4 * I3 + 2, 4 * I3 + 3 }); + } + + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept + { + return vec_perm(x.data, y.data, + (__vector unsigned char) { + 8 * I0 + 0, + 8 * I0 + 1, + 8 * I0 + 2, + 8 * I0 + 3, + 8 * I0 + 4, + 8 * I0 + 5, + 8 * I0 + 6, + 8 * I0 + 7, + 8 * I1 + 0, + 8 * I1 + 1, + 8 * I1 + 2, + 8 * I1 + 3, + 8 * I1 + 4, + 8 * I1 + 5, + 8 * I1 + 6, + 8 * I1 + 7, + }); + } + + // swizzle + // 16 x 8bit + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + uv16qi perm = (uv16qi) { Values... 
}; + return vec_perm(self.data, self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + uv16qi perm = (uv16qi) { Values... }; + return vec_perm(self.data, self.data, perm); + } + + // 8 x 16 bit + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 2 * V0, 2 * V0 + 1, + 2 * V1, 2 * V1 + 1, + 2 * V2, 2 * V2 + 1, + 2 * V3, 2 * V3 + 1, + 2 * V4, 2 * V4 + 1, + 2 * V5, 2 * V5 + 1, + 2 * V6, 2 * V6 + 1, + 2 * V7, 2 * V7 + 1 + }; + return vec_perm(self.data, self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 2 * V0, 2 * V0 + 1, + 2 * V1, 2 * V1 + 1, + 2 * V2, 2 * V2 + 1, + 2 * V3, 2 * V3 + 1, + 2 * V4, 2 * V4 + 1, + 2 * V5, 2 * V5 + 1, + 2 * V6, 2 * V6 + 1, + 2 * V7, 2 * V7 + 1 + }; + return vec_perm(self.data, self.data, perm); + } + + // 4 x 32 bit + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, + 4 * V1, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, + 4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, + 4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 + }; + return vec_perm(self.data, self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, + 4 * V1, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, + 4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, + 4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 + }; + return vec_perm(self.data, self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, + 4 * V1, 
4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, + 4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, + 4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 + }; + return vec_perm(self.data, self.data, perm); + } + + // 2 x 64 bit + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + using out = typename batch::register_type; + uv16qi perm = (uv16qi) { + 8 * V0 + 0, + 8 * V0 + 1, + 8 * V0 + 2, + 8 * V0 + 3, + 8 * V0 + 4, + 8 * V0 + 5, + 8 * V0 + 6, + 8 * V0 + 7, + 8 * V1 + 0, + 8 * V1 + 1, + 8 * V1 + 2, + 8 * V1 + 3, + 8 * V1 + 4, + 8 * V1 + 5, + 8 * V1 + 6, + 8 * V1 + 7, + }; + return (out)vec_perm((uv2di)self.data, (uv2di)self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + using out = typename batch::register_type; + uv16qi perm = (uv16qi) { + 8 * V0 + 0, + 8 * V0 + 1, + 8 * V0 + 2, + 8 * V0 + 3, + 8 * V0 + 4, + 8 * V0 + 5, + 8 * V0 + 6, + 8 * V0 + 7, + 8 * V1 + 0, + 8 * V1 + 1, + 8 * V1 + 2, + 8 * V1 + 3, + 8 * V1 + 4, + 8 * V1 + 5, + 8 * V1 + 6, + 8 * V1 + 7, + }; + return (out)vec_perm((v2di)self.data, (v2di)self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 8 * V0 + 0, + 8 * V0 + 1, + 8 * V0 + 2, + 8 * V0 + 3, + 8 * V0 + 4, + 8 * V0 + 5, + 8 * V0 + 6, + 8 * V0 + 7, + 8 * V1 + 0, + 8 * V1 + 1, + 8 * V1 + 2, + 8 * V1 + 3, + 8 * V1 + 4, + 8 * V1 + 5, + 8 * V1 + 6, + 8 * V1 + 7, + }; + return vec_perm(self.data, self.data, perm); + } + // zip_hi + template ::value>> + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_mergel(self.data, other.data); + } + + // zip_lo + template ::value>> + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_mergeh(self.data, other.data); + } + // bitwise_rshift + template ::value, void>::type> + XSIMD_INLINE batch 
bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + return self.data >> other; + } + // bitwise_lshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + return self.data << other; + } + + // isnan + template ::value>> + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return ~vec_cmpeq(self.data, self.data); + } + + // ceil + template ::value>> + XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept + { + return vec_ceil(self.data); + } + + // floor + template ::value>> + XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept + { + return vec_floor(self.data); + } + // round + // vec_round rounds ties to even instead of zero +#if defined __has_builtin && __has_builtin(__builtin_s390_vfi) + template ::value>> + XSIMD_INLINE batch round(batch const& self, requires_arch) noexcept + { + return __builtin_s390_vfi(self.data, 4, 1); + } +#endif + // trunc + template ::value>> + XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept + { + return vec_trunc(self.data); + } + } +} +#endif diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp index d69ff7560..23a668dc3 100644 --- a/include/xsimd/config/xsimd_arch.hpp +++ b/include/xsimd/config/xsimd_arch.hpp @@ -172,7 +172,8 @@ namespace xsimd using all_power_architectures = arch_list; using all_riscv_architectures = all_rvv_architectures; using all_wasm_architectures = arch_list; - using all_architectures = typename detail::join::type; + using all_s390x_architectures = arch_list; + using all_architectures = typename detail::join::type; using supported_architectures = typename detail::supported::type; diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index 662e550ff..0917404f4 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ 
-510,6 +510,18 @@ #define XSIMD_WITH_VSX 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if s390x VXE is available at compile-time, to 0 otherwise. + * Float vectors have been introduced with VXE included with IBM z14. + */ +#if defined(__VEC__) && __VEC__ == 10305 && __ARCH__ >= 12 +#define XSIMD_WITH_VXE 1 +#else +#define XSIMD_WITH_VXE 0 +#endif + // Workaround for MSVC compiler #ifdef _MSC_VER @@ -568,7 +580,7 @@ #endif -#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_VSX && !XSIMD_WITH_EMULATED +#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_VSX && !XSIMD_WITH_EMULATED && !XSIMD_WITH_VXE #define XSIMD_NO_SUPPORTED_ARCHITECTURE #endif diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index d58897f66..0d47ca5b9 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -71,6 +71,7 @@
namespace xsimd ARCH_FIELD_EX(detail::rvv<128>, rvv128) ARCH_FIELD(wasm) ARCH_FIELD(vsx) + ARCH_FIELD(vxe) #undef ARCH_FIELD diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp index 33f9b465d..69697ac7b 100644 --- a/include/xsimd/types/xsimd_all_registers.hpp +++ b/include/xsimd/types/xsimd_all_registers.hpp @@ -50,6 +50,8 @@ #include "xsimd_vsx_register.hpp" +#include "xsimd_vxe_register.hpp" + #if XSIMD_WITH_EMULATED #include "xsimd_emulated_register.hpp" #endif diff --git a/include/xsimd/types/xsimd_vxe_register.hpp b/include/xsimd/types/xsimd_vxe_register.hpp new file mode 100644 index 000000000..ba051baa5 --- /dev/null +++ b/include/xsimd/types/xsimd_vxe_register.hpp @@ -0,0 +1,86 @@ +/*************************************************************************** + * Copyright (c) Andreas Krebbel * + * Based on xsimd_vsx_register.hpp * + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_VXE_REGISTER_HPP +#define XSIMD_VXE_REGISTER_HPP + +#include "./xsimd_common_arch.hpp" +#include "./xsimd_register.hpp" + +#if XSIMD_WITH_VXE +#include +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * VXE instructions + */ + struct vxe : common + { + static constexpr bool supported() noexcept { return XSIMD_WITH_VXE; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr char const* name() noexcept { return "vxe"; } + }; + +#if XSIMD_WITH_VXE + namespace types + { + +#define XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(T, Tv, Tb) \ + template <> \ + struct get_bool_simd_register \ + { \ + struct type \ + { \ + using register_type = __vector __bool Tb; \ + register_type data; \ + type() = default; \ + type(register_type r) \ + : data(r) \ + { \ + } \ + operator register_type() const noexcept { return data; } \ + }; \ + }; \ + XSIMD_DECLARE_SIMD_REGISTER(T, vxe, __vector Tv) + + // The VXE vector intrinsics do not support long, unsigned long, + // and char data types. batches of these types are vectors of + // equivalent types. 
+ XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(signed char, signed char, char); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned char, unsigned char, char); +#ifdef __CHAR_UNSIGNED__ + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(char, unsigned char, char); +#else + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(char, signed char, char); +#endif + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned short, unsigned short, short); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(short, short, short); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned int, unsigned int, int); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(int, int, int); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned long, unsigned long long, long long); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(long, long long, long long); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(float, float, int); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(double, double, long long); + +#undef XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER + } +#endif +} + +#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e430901f7..feb0e6edb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -108,6 +108,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" # Nothing specific elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc") # Nothing specific + elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector") elseif(NOT WIN32 AND NOT EMSCRIPTEN) if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}") diff --git a/test/check_arch.sh b/test/check_arch.sh index b8e6686a6..d26ed9d4b 100644 --- a/test/check_arch.sh +++ b/test/check_arch.sh @@ -87,3 +87,4 @@ znver3 znver4 btver1 btver2 +z14