diff --git a/.github/toolchains/gcc-s390x-linux-gnu.cmake b/.github/toolchains/gcc-s390x-linux-gnu.cmake new file mode 100644 index 000000000..05fba0b53 --- /dev/null +++ b/.github/toolchains/gcc-s390x-linux-gnu.cmake @@ -0,0 +1,4 @@ +set(CMAKE_SYSTEM_PROCESSOR s390x) +set(triple s390x-linux-gnu) + +include(${CMAKE_CURRENT_LIST_DIR}/gcc.cmake) diff --git a/.github/workflows/cross-s390x.yml b/.github/workflows/cross-s390x.yml new file mode 100644 index 000000000..2f98aae77 --- /dev/null +++ b/.github/workflows/cross-s390x.yml @@ -0,0 +1,48 @@ +name: IBM Z cross-compilation build +on: [push, pull_request] +concurrency: + group: ${{ github.workflow }}-${{ github.job }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + build: + runs-on: ubuntu-latest + name: '${{ matrix.target.platform }}, ${{ matrix.sys.compiler }} ${{ matrix.sys.version }}' + strategy: + matrix: + target: + - { platform: 's390x', dir: 's390x-linux-gnu', flags: '-mzvector -march=z14', full: 'OFF' } + sys: + - { compiler: 'gcc', version: '14' } + steps: + - name: Setup compiler + if: ${{ matrix.sys.compiler == 'gcc' }} + run: | + sudo apt-get update || exit 1 + sudo apt-get -y --no-install-suggests --no-install-recommends install g++-${{ matrix.sys.version }}-${{ matrix.target.dir }} g++-${{ matrix.sys.version }}-multilib cmake || exit 1 + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-gcc || true + sudo update-alternatives --remove-all ${{ matrix.target.dir }}-g++ || true + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-gcc ${{ matrix.target.dir }}-gcc /usr/bin/${{ matrix.target.dir }}-gcc-${{ matrix.sys.version }} 20 + sudo update-alternatives --install /usr/bin/${{ matrix.target.dir }}-g++ ${{ matrix.target.dir }}-g++ /usr/bin/${{ matrix.target.dir }}-g++-${{ matrix.sys.version }} 20 + - name: Setup QEMU + run: | + sudo apt-get -y --no-install-suggests --no-install-recommends install qemu-user + - name: Setup Ninja + run: | + sudo apt-get -y install ninja-build
+ - name: Checkout xsimd + uses: actions/checkout@v6 + - name: Setup + run: | + cmake -B _build \ + -DBUILD_TESTS=ON -DDOWNLOAD_DOCTEST=ON \ + -DBUILD_BENCHMARK=${{ matrix.target.full }} -DBUILD_EXAMPLES=${{ matrix.target.full }} \ + -DCMAKE_BUILD_TYPE=Release \ + -DCMAKE_C_FLAGS="${{ matrix.target.flags }}" \ + -DCMAKE_CXX_FLAGS="${{ matrix.target.flags }}" \ + -DCMAKE_TOOLCHAIN_FILE=${{ github.workspace }}/.github/toolchains/${{ matrix.sys.compiler }}-${{ matrix.target.dir }}.cmake + - name: Build + run: cmake --build _build --verbose -j1 + - name: Testing xsimd + run: qemu-${{ matrix.target.platform }} -L /usr/${{ matrix.target.dir}}/ ./test/test_xsimd + working-directory: ${{ github.workspace }}/_build diff --git a/include/xsimd/arch/xsimd_isa.hpp b/include/xsimd/arch/xsimd_isa.hpp index 1772159a0..8ab4261ea 100644 --- a/include/xsimd/arch/xsimd_isa.hpp +++ b/include/xsimd/arch/xsimd_isa.hpp @@ -136,6 +136,10 @@ #include "./xsimd_vsx.hpp" #endif +#if XSIMD_WITH_VXE +#include "./xsimd_vxe.hpp" +#endif + // Must come last to have access to all conversion specializations. #include "./xsimd_common.hpp" diff --git a/include/xsimd/arch/xsimd_vxe.hpp b/include/xsimd/arch/xsimd_vxe.hpp new file mode 100644 index 000000000..9eeb07f19 --- /dev/null +++ b/include/xsimd/arch/xsimd_vxe.hpp @@ -0,0 +1,797 @@ +/*************************************************************************** + * Copyright (c) Andreas Krebbel * + * Based on xsimd_vsx.hpp * + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_VXE_HPP +#define XSIMD_VXE_HPP + +#include +#include +#include + +#include "../types/xsimd_vxe_register.hpp" + +namespace xsimd +{ + namespace kernel + { + using namespace types; + using v1ti = __int128 __attribute__((vector_size(16))); + using v4sf = float __attribute__((vector_size(16))); + using v2df = double __attribute__((vector_size(16))); + using uv2di = unsigned long long int __attribute__((vector_size(16))); + using v2di = long long int __attribute__((vector_size(16))); + using uv4si = unsigned int __attribute__((vector_size(16))); + using v4si = int __attribute__((vector_size(16))); + using uv8hi = unsigned short int __attribute__((vector_size(16))); + using v8hi = short int __attribute__((vector_size(16))); + using uv16qi = unsigned char __attribute__((vector_size(16))); + using v16qi = signed char __attribute__((vector_size(16))); + + // builtin_t - the scalar type as it would be used for a vector intrinsic + // VXE vector intrinsics do not support long, unsigned long, and char + // The builtin definition can be used to map the incoming + // type to the right one to be used with the intrinsics. 
+ template + struct builtin_scalar + { + using type = T; + }; + + template <> + struct builtin_scalar + { + using type = unsigned long long; + }; + + template <> + struct builtin_scalar + { + using type = long long; + }; + +#ifdef __CHAR_UNSIGNED__ + template <> + struct builtin_scalar + { + using type = unsigned char; + }; +#else + template <> + struct builtin_scalar + { + using type = signed char; + }; +#endif + + template + using builtin_t = typename builtin_scalar::type; + + // bitwise_cast + template + XSIMD_INLINE batch bitwise_cast(batch const& self, batch const&, requires_arch) noexcept + { + return (typename batch::register_type)(self.data); + } + + // batch_bool_cast + template + XSIMD_INLINE batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept + { + return (typename batch_bool::register_type)self.data; + } + + // load + + // load_unaligned + template ::value>> + XSIMD_INLINE batch load_unaligned(T const* mem, convert, requires_arch) noexcept + { + return (typename batch::register_type)vec_xl(0, (builtin_t*)mem); + } + + // load_aligned + template ::value>> + XSIMD_INLINE batch load_aligned(T const* mem, convert, requires_arch) noexcept + { + return load_unaligned(mem, kernel::convert {}, vxe {}); + } + + // load_complex + namespace detail + { + template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + // Interleave real and imaginary parts + // hi = [r0, i0, r1, i1], lo = [r2, i2, r3, i3] + // We need: real = [r0, r1, r2, r3], imag = [i0, i1, i2, i3] + using v4sf = float __attribute__((vector_size(16))); + uv16qi perm_real = (uv16qi) { 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27 }; + uv16qi perm_imag = (uv16qi) { 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31 }; + v4sf real = vec_perm((v4sf)hi.data, (v4sf)lo.data, perm_real); + v4sf imag = vec_perm((v4sf)hi.data, (v4sf)lo.data, perm_imag); + return { batch(real), batch(imag) }; + } + + 
template + XSIMD_INLINE batch, A> load_complex(batch const& hi, batch const& lo, requires_arch) noexcept + { + // hi = [r0, i0], lo = [r1, i1] + // We need: real = [r0, r1], imag = [i0, i1] + using v2df = double __attribute__((vector_size(16))); + uv16qi perm_real = (uv16qi) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }; + uv16qi perm_imag = (uv16qi) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 }; + v2df real = vec_perm((v2df)hi.data, (v2df)lo.data, perm_real); + v2df imag = vec_perm((v2df)hi.data, (v2df)lo.data, perm_imag); + return { batch(real), batch(imag) }; + } + + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { 0, 1, 2, 3, 16, 17, 18, 19, 4, 5, 6, 7, 20, 21, 22, 23 }; + return batch(vec_perm((v4sf)self.real().data, (v4sf)self.imag().data, perm)); + } + + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { 8, 9, 10, 11, 24, 25, 26, 27, 12, 13, 14, 15, 28, 29, 30, 31 }; + return batch(vec_perm((v4sf)self.real().data, (v4sf)self.imag().data, perm)); + } + + template + XSIMD_INLINE batch complex_low(batch, A> const& self, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { 0, 1, 2, 3, 4, 5, 6, 7, 16, 17, 18, 19, 20, 21, 22, 23 }; + return batch(vec_perm((v2df)self.real().data, (v2df)self.imag().data, perm)); + } + + template + XSIMD_INLINE batch complex_high(batch, A> const& self, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26, 27, 28, 29, 30, 31 }; + return batch(vec_perm((v2df)self.real().data, (v2df)self.imag().data, perm)); + } + } + + // store + template + XSIMD_INLINE void store_aligned(T* dst, batch const& src, requires_arch) noexcept + { + vec_xst(src.data, 0, (builtin_t*)dst); + } + + template + XSIMD_INLINE void store_unaligned(T* dst, batch const& src, requires_arch) noexcept + { + store_aligned(dst, src, vxe {}); + } + + // set + 
template + XSIMD_INLINE batch set(batch const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + return typename batch::register_type { values... }; + } + + template ::value>> + XSIMD_INLINE batch_bool set(batch_bool const&, requires_arch, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch_bool::size, "consistent init"); + return typename batch_bool::register_type { static_cast::register_type>()[0])>(values ? -1LL : 0LL)... }; + } + // first + template ::value>> + XSIMD_INLINE T first(batch const& self, requires_arch) noexcept + { + return self.data[0]; + } + // insert + template ::value>> + XSIMD_INLINE batch insert(batch const& self, T val, index, requires_arch) noexcept + { + // vec_insert on float is broken with clang + batch out(self); + out.data[I] = val; + return out; + } + + // eq + template + XSIMD_INLINE batch_bool eq(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data == other.data; + } + template + XSIMD_INLINE batch_bool eq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data == other.data; + } + + template + XSIMD_INLINE batch_bool lt(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data < other.data; + } + template + XSIMD_INLINE batch_bool le(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data <= other.data; + } + + // neq + template + XSIMD_INLINE batch_bool neq(batch const& self, batch const& other, requires_arch) noexcept + { + return ~(self.data == other.data); + } + + template + XSIMD_INLINE batch_bool neq(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return bitwise_xor(self, other); + } + + // sub + template + XSIMD_INLINE batch sub(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data - other.data; + } + + // broadcast + template ::value>> + 
XSIMD_INLINE batch broadcast(T val, requires_arch) noexcept + { + return vec_splats(static_cast>(val)); + } + + // abs + template ::value, void>::type> + XSIMD_INLINE batch abs(batch const& self, requires_arch) noexcept + { + return vec_abs(self.data); + } + // bitwise_and + template + XSIMD_INLINE batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch::register_type)((v4si)self.data & (v4si)other.data); + } + template + XSIMD_INLINE batch_bool bitwise_and(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data & other.data; + } + + // bitwise_or + template + XSIMD_INLINE batch bitwise_or(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch::register_type)((v4si)self.data | (v4si)other.data); + } + template + XSIMD_INLINE batch_bool bitwise_or(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data | other.data; + } + + // bitwise_xor + template + XSIMD_INLINE batch bitwise_xor(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch::register_type)((v4si)self.data ^ (v4si)other.data); + } + template + XSIMD_INLINE batch_bool bitwise_xor(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data ^ other.data; + } + + // bitwise_not + template + XSIMD_INLINE batch bitwise_not(batch const& self, requires_arch) noexcept + { + // ~ operator does not work on floating point vectors + return (typename batch::register_type)(~(v4si)self.data); + } + template + XSIMD_INLINE batch_bool bitwise_not(batch_bool const& self, requires_arch) noexcept + { + return ~self.data; + } + + // bitwise_andnot + template + XSIMD_INLINE batch bitwise_andnot(batch const& self, batch const& other, requires_arch) noexcept + { + return (typename batch::register_type)((v4si)self.data & ~(v4si)other.data); + } + template + XSIMD_INLINE batch_bool 
bitwise_andnot(batch_bool const& self, batch_bool const& other, requires_arch) noexcept + { + return self.data & ~other.data; + } + + // div + template ::value>> + XSIMD_INLINE batch div(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data / other.data; + } + + // neg + template + XSIMD_INLINE batch neg(batch const& self, requires_arch) noexcept + { + return (typename batch::register_type) { 0 } - self.data; + } + + // add + template + XSIMD_INLINE batch add(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data + other.data; + } + + // all + template + XSIMD_INLINE bool all(batch_bool const& self, requires_arch) noexcept + { + return ((v1ti)self.data)[0] == -1; + } + + // any + template + XSIMD_INLINE bool any(batch_bool const& self, requires_arch) noexcept + { + return ((v1ti)self.data)[0] != 0; + } + // avgr + template ::value>> + XSIMD_INLINE batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_avg(self.data, other.data); + } + + // max + template + XSIMD_INLINE batch max(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_max(self.data, other.data); + } + // min + template + XSIMD_INLINE batch min(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_min(self.data, other.data); + } + // fma + template ::value>> + XSIMD_INLINE batch fma(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return vec_madd(x.data, y.data, z.data); + } + // fms + template ::value>> + XSIMD_INLINE batch fms(batch const& x, batch const& y, batch const& z, requires_arch) noexcept + { + return vec_msub(x.data, y.data, z.data); + } + + // mul + template ::value>> + XSIMD_INLINE batch mul(batch const& self, batch const& other, requires_arch) noexcept + { + return self.data * other.data; + } + // haddp + template + XSIMD_INLINE batch haddp(batch const* r, requires_arch) noexcept + { + v4sf lo01, hi01, lo23, 
hi23, sum01, sum23, sumeven, sumodd; + lo01 = vec_mergel(r[0].data, r[1].data); // { r[0][2], r[1][2], r[0][3], r[1][3] } + hi01 = vec_mergeh(r[0].data, r[1].data); // { r[0][0], r[1][0], r[0][1], r[1][1] } + lo23 = vec_mergel(r[2].data, r[3].data); // { r[2][2], r[2][2], r[3][3], r[3][3] } + hi23 = vec_mergeh(r[2].data, r[3].data); // { r[2][0], r[2][0], r[3][1], r[3][1] } + sum01 = lo01 + hi01; // { r[0][0] + r[0][2], r[1][0] + r[1][2], r[0][1] + r[0][3], r[1][1] + r[1][3] } + sum23 = lo23 + hi23; // { r[2][0] + r[2][2], r[3][0] + r[3][2], r[2][1] + r[2][3], r[3][1] + r[3][3] } + sumeven = (v4sf)vec_mergeh((v2di)sum01, (v2di)sum23); // { r[0][0] + r[0][2], r[1][0] + r[1][2], r[2][0] + r[2][2], r[3][0] + r[3][2] } + sumodd = (v4sf)vec_mergel((v2di)sum01, (v2di)sum23); // { r[0][1] + r[0][3], r[1][1] + r[1][3], r[2][1] + r[2][3], r[3][1] + r[3][3] } + return sumeven + sumodd; + } + template + XSIMD_INLINE batch haddp(batch const* row, requires_arch) noexcept + { + return vec_mergeh(row[0].data, row[1].data) + vec_mergel(row[0].data, row[1].data); + } + + // reduce_add + template + XSIMD_INLINE float reduce_add(batch const& self, requires_arch) noexcept + { + v4sf shifted_64 = vec_sld(self.data, self.data, 8); + v4sf added_1 = self.data + shifted_64; + v4sf shifted_32 = vec_sld(added_1, added_1, 4); + return (added_1 + shifted_32)[0]; + } + + template + XSIMD_INLINE double reduce_add(batch const& self, requires_arch) noexcept + { + return (self.data + vec_sld(self.data, self.data, 8))[0]; + } + + template + XSIMD_INLINE uint64_t reduce_add(batch const& self, requires_arch) noexcept + { + uv2di shifted = vec_sld((uv2di)self.data, (uv2di)self.data, 8); + uv2di sum = (uv2di)self.data + shifted; + return (uint64_t)sum[0]; + } + template + XSIMD_INLINE int64_t reduce_add(batch const& self, requires_arch) noexcept + { + v2di shifted = vec_sld((v2di)self.data, (v2di)self.data, 8); + v2di sum = (v2di)self.data + shifted; + return (int64_t)sum[0]; + } + template ::value, 
void>::type> + XSIMD_INLINE T reduce_add(batch const& self, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + using t = typename batch::register_type; + t shifted_64 = vec_sld(self.data, self.data, 8); + t added_1 = self.data + shifted_64; + t shifted_32 = vec_sld(added_1, added_1, 4); + return (added_1 + shifted_32)[0]; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + using t = typename batch::register_type; + t shifted_64 = vec_sld(self.data, self.data, 8); + t added_1 = self.data + shifted_64; + t shifted_32 = vec_sld(added_1, added_1, 4); + t added_2 = added_1 + shifted_32; + t shifted_16 = vec_sld(added_2, added_2, 2); + return (added_2 + shifted_16)[0]; + } + else + { + using t = typename batch::register_type; + t shifted_64 = vec_sld(self.data, self.data, 8); + t added_1 = self.data + shifted_64; + t shifted_32 = vec_sld(added_1, added_1, 4); + t added_2 = added_1 + shifted_32; + t shifted_16 = vec_sld(added_2, added_2, 2); + t added_3 = added_2 + shifted_16; + t shifted_8 = vec_sld(added_3, added_3, 1); + return (added_3 + shifted_8)[0]; + } + } + + // select + template + XSIMD_INLINE batch select(batch_bool const& cond, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return vec_sel(false_br.data, true_br.data, cond.data); + } + template ::value>> + XSIMD_INLINE batch select(batch_bool_constant const&, batch const& true_br, batch const& false_br, requires_arch) noexcept + { + return select(batch_bool { Values... 
}, true_br, false_br, vxe {}); + } + + // slide_left + template + XSIMD_INLINE batch slide_left(batch const& x, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(N == batch::size * sizeof(T)) + { + return batch(0); + } + else + { + auto shift_count = vec_splats((uint8_t)(8 * N)); + return vec_sll(x.data, shift_count); + } + } + + // slide_right + template + XSIMD_INLINE batch slide_right(batch const& x, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(N == batch::size * sizeof(T)) + { + return batch(0); + } + else + { + auto shift_count = vec_splats((uint8_t)(8 * N)); + return vec_srl(x.data, shift_count); + } + } + + // sqrt + template ::value>> + XSIMD_INLINE batch sqrt(batch const& val, requires_arch) noexcept + { + return vec_sqrt(val.data); + } + + // rsqrt + template ::value>> + XSIMD_INLINE batch rsqrt(batch const& val, requires_arch) noexcept + { + return batch(T(1)) / sqrt(val, vxe {}); + } + + // shuffle + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept + { + return vec_perm(x.data, y.data, + (__vector unsigned char) { + 4 * I0 + 0, 4 * I0 + 1, 4 * I0 + 2, 4 * I0 + 3, + 4 * I1 + 0, 4 * I1 + 1, 4 * I1 + 2, 4 * I1 + 3, + 4 * I2 + 0, 4 * I2 + 1, 4 * I2 + 2, 4 * I2 + 3, + 4 * I3 + 0, 4 * I3 + 1, 4 * I3 + 2, 4 * I3 + 3 }); + } + + template + XSIMD_INLINE batch shuffle(batch const& x, batch const& y, batch_constant, requires_arch) noexcept + { + return vec_perm(x.data, y.data, + (__vector unsigned char) { + 8 * I0 + 0, + 8 * I0 + 1, + 8 * I0 + 2, + 8 * I0 + 3, + 8 * I0 + 4, + 8 * I0 + 5, + 8 * I0 + 6, + 8 * I0 + 7, + 8 * I1 + 0, + 8 * I1 + 1, + 8 * I1 + 2, + 8 * I1 + 3, + 8 * I1 + 4, + 8 * I1 + 5, + 8 * I1 + 6, + 8 * I1 + 7, + }); + } + + // swizzle + // 16 x 8bit + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + uv16qi perm = (uv16qi) { Values... 
}; + return vec_perm(self.data, self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + static_assert(sizeof...(Values) == batch::size, "consistent init"); + uv16qi perm = (uv16qi) { Values... }; + return vec_perm(self.data, self.data, perm); + } + + // 8 x 16 bit + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 2 * V0, 2 * V0 + 1, + 2 * V1, 2 * V1 + 1, + 2 * V2, 2 * V2 + 1, + 2 * V3, 2 * V3 + 1, + 2 * V4, 2 * V4 + 1, + 2 * V5, 2 * V5 + 1, + 2 * V6, 2 * V6 + 1, + 2 * V7, 2 * V7 + 1 + }; + return vec_perm(self.data, self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 2 * V0, 2 * V0 + 1, + 2 * V1, 2 * V1 + 1, + 2 * V2, 2 * V2 + 1, + 2 * V3, 2 * V3 + 1, + 2 * V4, 2 * V4 + 1, + 2 * V5, 2 * V5 + 1, + 2 * V6, 2 * V6 + 1, + 2 * V7, 2 * V7 + 1 + }; + return vec_perm(self.data, self.data, perm); + } + + // 4 x 32 bit + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, + 4 * V1, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, + 4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, + 4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 + }; + return vec_perm(self.data, self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, + 4 * V1, 4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, + 4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, + 4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 + }; + return vec_perm(self.data, self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 4 * V0, 4 * V0 + 1, 4 * V0 + 2, 4 * V0 + 3, + 4 * V1, 
4 * V1 + 1, 4 * V1 + 2, 4 * V1 + 3, + 4 * V2, 4 * V2 + 1, 4 * V2 + 2, 4 * V2 + 3, + 4 * V3, 4 * V3 + 1, 4 * V3 + 2, 4 * V3 + 3 + }; + return vec_perm(self.data, self.data, perm); + } + + // 2 x 64 bit + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + using out = typename batch::register_type; + uv16qi perm = (uv16qi) { + 8 * V0 + 0, + 8 * V0 + 1, + 8 * V0 + 2, + 8 * V0 + 3, + 8 * V0 + 4, + 8 * V0 + 5, + 8 * V0 + 6, + 8 * V0 + 7, + 8 * V1 + 0, + 8 * V1 + 1, + 8 * V1 + 2, + 8 * V1 + 3, + 8 * V1 + 4, + 8 * V1 + 5, + 8 * V1 + 6, + 8 * V1 + 7, + }; + return (out)vec_perm((uv2di)self.data, (uv2di)self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + using out = typename batch::register_type; + uv16qi perm = (uv16qi) { + 8 * V0 + 0, + 8 * V0 + 1, + 8 * V0 + 2, + 8 * V0 + 3, + 8 * V0 + 4, + 8 * V0 + 5, + 8 * V0 + 6, + 8 * V0 + 7, + 8 * V1 + 0, + 8 * V1 + 1, + 8 * V1 + 2, + 8 * V1 + 3, + 8 * V1 + 4, + 8 * V1 + 5, + 8 * V1 + 6, + 8 * V1 + 7, + }; + return (out)vec_perm((v2di)self.data, (v2di)self.data, perm); + } + template + XSIMD_INLINE batch swizzle(batch const& self, batch_constant, requires_arch) noexcept + { + uv16qi perm = (uv16qi) { + 8 * V0 + 0, + 8 * V0 + 1, + 8 * V0 + 2, + 8 * V0 + 3, + 8 * V0 + 4, + 8 * V0 + 5, + 8 * V0 + 6, + 8 * V0 + 7, + 8 * V1 + 0, + 8 * V1 + 1, + 8 * V1 + 2, + 8 * V1 + 3, + 8 * V1 + 4, + 8 * V1 + 5, + 8 * V1 + 6, + 8 * V1 + 7, + }; + return vec_perm(self.data, self.data, perm); + } + // zip_hi + template ::value>> + XSIMD_INLINE batch zip_hi(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_mergel(self.data, other.data); + } + + // zip_lo + template ::value>> + XSIMD_INLINE batch zip_lo(batch const& self, batch const& other, requires_arch) noexcept + { + return vec_mergeh(self.data, other.data); + } + // bitwise_rshift + template ::value, void>::type> + XSIMD_INLINE batch 
bitwise_rshift(batch const& self, int32_t other, requires_arch) noexcept + { + return self.data >> other; + } + // bitwise_lshift + template ::value, void>::type> + XSIMD_INLINE batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept + { + return self.data << other; + } + + // isnan + template ::value>> + XSIMD_INLINE batch_bool isnan(batch const& self, requires_arch) noexcept + { + return ~vec_cmpeq(self.data, self.data); + } + + // ceil + template ::value>> + XSIMD_INLINE batch ceil(batch const& self, requires_arch) noexcept + { + return vec_ceil(self.data); + } + + // floor + template ::value>> + XSIMD_INLINE batch floor(batch const& self, requires_arch) noexcept + { + return vec_floor(self.data); + } + // round + // vec_round rounds ties to even instead of zero +#if defined __has_builtin && __has_builtin(__builtin_s390_vfi) + template ::value>> + XSIMD_INLINE batch round(batch const& self, requires_arch) noexcept + { + return __builtin_s390_vfi(self.data, 4, 1); + } +#endif + // trunc + template ::value>> + XSIMD_INLINE batch trunc(batch const& self, requires_arch) noexcept + { + return vec_trunc(self.data); + } + } +} +#endif diff --git a/include/xsimd/config/xsimd_arch.hpp b/include/xsimd/config/xsimd_arch.hpp index d69ff7560..23a668dc3 100644 --- a/include/xsimd/config/xsimd_arch.hpp +++ b/include/xsimd/config/xsimd_arch.hpp @@ -172,7 +172,8 @@ namespace xsimd using all_power_architectures = arch_list; using all_riscv_architectures = all_rvv_architectures; using all_wasm_architectures = arch_list; - using all_architectures = typename detail::join::type; + using all_s390x_architectures = arch_list; + using all_architectures = typename detail::join::type; using supported_architectures = typename detail::supported::type; diff --git a/include/xsimd/config/xsimd_config.hpp b/include/xsimd/config/xsimd_config.hpp index 662e550ff..0917404f4 100644 --- a/include/xsimd/config/xsimd_config.hpp +++ b/include/xsimd/config/xsimd_config.hpp @@ 
-510,6 +510,18 @@ #define XSIMD_WITH_VSX 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if s390x VXE is available at compile-time, to 0 otherwise. + * Float vectors have been introduced with VXE included with IBM z14. + */ +#if defined(__VEC__) && __VEC__ == 10305 && __ARCH__ >= 12 +#define XSIMD_WITH_VXE 1 +#else +#define XSIMD_WITH_VXE 0 +#endif + // Workaround for MSVC compiler #ifdef _MSC_VER @@ -568,7 +580,7 @@ #endif -#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_VSX && !XSIMD_WITH_EMULATED +#if !XSIMD_WITH_SSE2 && !XSIMD_WITH_SSE3 && !XSIMD_WITH_SSSE3 && !XSIMD_WITH_SSE4_1 && !XSIMD_WITH_SSE4_2 && !XSIMD_WITH_AVX && !XSIMD_WITH_AVX2 && !XSIMD_WITH_AVXVNNI && !XSIMD_WITH_FMA3_SSE && !XSIMD_WITH_FMA4 && !XSIMD_WITH_FMA3_AVX && !XSIMD_WITH_FMA3_AVX2 && !XSIMD_WITH_AVX512F && !XSIMD_WITH_AVX512CD && !XSIMD_WITH_AVX512DQ && !XSIMD_WITH_AVX512BW && !XSIMD_WITH_AVX512ER && !XSIMD_WITH_AVX512PF && !XSIMD_WITH_AVX512IFMA && !XSIMD_WITH_AVX512VBMI && !XSIMD_WITH_AVX512VBMI2 && !XSIMD_WITH_NEON && !XSIMD_WITH_NEON64 && !XSIMD_WITH_SVE && !XSIMD_WITH_RVV && !XSIMD_WITH_WASM && !XSIMD_WITH_VSX && !XSIMD_WITH_EMULATED && !XSIMD_WITH_VXE #define XSIMD_NO_SUPPORTED_ARCHITECTURE #endif diff --git a/include/xsimd/config/xsimd_cpuid.hpp b/include/xsimd/config/xsimd_cpuid.hpp index d58897f66..0d47ca5b9 100644 --- a/include/xsimd/config/xsimd_cpuid.hpp +++ b/include/xsimd/config/xsimd_cpuid.hpp @@ -71,6 +71,7 @@
namespace xsimd ARCH_FIELD_EX(detail::rvv<128>, rvv128) ARCH_FIELD(wasm) ARCH_FIELD(vsx) + ARCH_FIELD(vxe) #undef ARCH_FIELD diff --git a/include/xsimd/types/xsimd_all_registers.hpp b/include/xsimd/types/xsimd_all_registers.hpp index 33f9b465d..69697ac7b 100644 --- a/include/xsimd/types/xsimd_all_registers.hpp +++ b/include/xsimd/types/xsimd_all_registers.hpp @@ -50,6 +50,8 @@ #include "xsimd_vsx_register.hpp" +#include "xsimd_vxe_register.hpp" + #if XSIMD_WITH_EMULATED #include "xsimd_emulated_register.hpp" #endif diff --git a/include/xsimd/types/xsimd_vxe_register.hpp b/include/xsimd/types/xsimd_vxe_register.hpp new file mode 100644 index 000000000..ba051baa5 --- /dev/null +++ b/include/xsimd/types/xsimd_vxe_register.hpp @@ -0,0 +1,86 @@ +/*************************************************************************** + * Copyright (c) Andreas Krebbel * + * Based on xsimd_vsx_register.hpp * + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_VXE_REGISTER_HPP +#define XSIMD_VXE_REGISTER_HPP + +#include "./xsimd_common_arch.hpp" +#include "./xsimd_register.hpp" + +#if XSIMD_WITH_VXE +#include +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * VXE instructions + */ + struct vxe : common + { + static constexpr bool supported() noexcept { return XSIMD_WITH_VXE; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr char const* name() noexcept { return "vxe"; } + }; + +#if XSIMD_WITH_VXE + namespace types + { + +#define XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(T, Tv, Tb) \ + template <> \ + struct get_bool_simd_register \ + { \ + struct type \ + { \ + using register_type = __vector __bool Tb; \ + register_type data; \ + type() = default; \ + type(register_type r) \ + : data(r) \ + { \ + } \ + operator register_type() const noexcept { return data; } \ + }; \ + }; \ + XSIMD_DECLARE_SIMD_REGISTER(T, vxe, __vector Tv) + + // The VXE vector intrinsics do not support long, unsigned long, + // and char data types. batches of these types are vectors of + // equivalent types. 
+ XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(signed char, signed char, char); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned char, unsigned char, char); +#ifdef __CHAR_UNSIGNED__ + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(char, unsigned char, char); +#else + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(char, signed char, char); +#endif + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned short, unsigned short, short); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(short, short, short); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned int, unsigned int, int); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(int, int, int); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(unsigned long, unsigned long long, long long); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(long, long long, long long); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(float, float, int); + XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER(double, double, long long); + +#undef XSIMD_DECLARE_SIMD_BOOL_VXE_REGISTER + } +#endif +} + +#endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index e430901f7..feb0e6edb 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -108,6 +108,8 @@ if (CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU" # Nothing specific elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "powerpc") # Nothing specific + elseif(${CMAKE_SYSTEM_PROCESSOR} MATCHES "s390x") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=z14 -mzvector") elseif(NOT WIN32 AND NOT EMSCRIPTEN) if(NOT CMAKE_CXX_FLAGS MATCHES "-march" AND NOT CMAKE_CXX_FLAGS MATCHES "-arch" AND NOT CMAKE_OSX_ARCHITECTURES) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=${TARGET_ARCH}") diff --git a/test/check_arch.sh b/test/check_arch.sh index b8e6686a6..d26ed9d4b 100644 --- a/test/check_arch.sh +++ b/test/check_arch.sh @@ -87,3 +87,4 @@ znver3 znver4 btver1 btver2 +z14