Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

WIP: clang build fixes and workarounds #16

Draft
wants to merge 13 commits into
base: master
Choose a base branch
from
184 changes: 104 additions & 80 deletions experimental/bits/simd.h
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,12 @@ using __m512d [[__gnu__::__vector_size__(64)]] = double;
using __m512i [[__gnu__::__vector_size__(64)]] = long long;
#endif

#if defined __clang__
template<typename T> auto __builtin_ia32_ps256_ps (T x) { return __builtin_shufflevector(x, _mm_setzero_ps() , 0, 1, 2, 3, 4, 4, 4, 4); }
template<typename T> auto __builtin_ia32_ps512_ps (T x) { return __builtin_shufflevector(x, _mm_setzero_ps() , 0, 1, 2, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4); }
template<typename T> auto __builtin_ia32_ps512_256ps(T x) { return __builtin_shufflevector(x, _mm256_setzero_ps(), 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 8, 8, 8, 8, 8, 8); }
#endif

// __next_power_of_2{{{
/**
* \internal
Expand Down Expand Up @@ -178,7 +184,7 @@ using __value_type_or_identity_t
// }}}
// __is_vectorizable {{{
template <typename _Tp>
struct __is_vectorizable : public std::is_arithmetic<_Tp>
struct __is_vectorizable : public std::is_arithmetic<std::remove_reference_t<_Tp>>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

why is this needed? _Tp should never be a reference. And references are not vectorizable. (Pointers might be - needs a proposal)

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

clang by some reason returns long long&& out of operator[] in my example:
https://godbolt.org/z/WJN7_M

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Then whatever calls __is_vectorizable<decltype(x[0])> is incorrect.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, thus a number of remove_references will be required to workaround that thing across other places.

{
};
template <> struct __is_vectorizable<bool> : public false_type
Expand Down Expand Up @@ -381,7 +387,12 @@ __is_neon_abi()

// }}}
// __make_dependent_t {{{
template <typename, typename _Up> using __make_dependent_t = _Up;
template <typename, typename _Up> struct __make_dependent
{
using type = _Up;
};
template <typename _Tp, typename _Up>
using __make_dependent_t = typename __make_dependent<_Tp, _Up>::type;

// }}}
// ^^^ ---- type traits ---- ^^^
Expand Down Expand Up @@ -1039,7 +1050,7 @@ template <size_t _Np, bool _Sanitized> struct _BitMask
"not implemented for bitmasks larger than one ullong");
if constexpr (_NewSize == 1) // must sanitize because the return _Tp is bool
return _SanitizedBitMask<1>{
{static_cast<bool>(_M_bits[0] & (_Tp(1) << _DropLsb))}};
(static_cast<bool>(_M_bits[0] & (_Tp(1) << _DropLsb)))};
else
return _BitMask<_NewSize,
((_NewSize + _DropLsb == sizeof(_Tp) * CHAR_BIT
Expand Down Expand Up @@ -1285,7 +1296,7 @@ struct __vector_type_n<_Tp, _Np,
static constexpr size_t _Bytes = _Np * sizeof(_Tp) < __min_vector_size<_Tp>
? __min_vector_size<_Tp>
: __next_power_of_2(_Np * sizeof(_Tp));
using type [[__gnu__::__vector_size__(_Bytes)]] = _Tp;
using type [[__gnu__::__vector_size__(_Bytes)]] = std::remove_reference_t<_Tp>;
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

again, _Tp should never be a reference

};

template <typename _Tp, size_t _Bytes, size_t = _Bytes % sizeof(_Tp)>
Expand Down Expand Up @@ -1339,7 +1350,7 @@ template <typename _Tp>
struct _VectorTraitsImpl<_Tp, enable_if_t<__is_vector_type_v<_Tp>>>
{
using type = _Tp;
using value_type = decltype(std::declval<_Tp>()[0]);
using value_type = std::remove_reference_t<decltype(std::declval<_Tp>()[0])>;
static constexpr int _S_width = sizeof(_Tp) / sizeof(value_type);
using _Wrapper = _SimdWrapper<value_type, _S_width>;
template <typename _Up, int _W = _S_width>
Expand Down Expand Up @@ -1559,62 +1570,77 @@ __generate_vector(_Gp&& __gen)

// }}}
// __xor{{{
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>, typename... _Dummy>
_GLIBCXX_SIMD_INTRINSIC constexpr _Tp
__xor(_Tp __a, typename _TVT::type __b, _Dummy...) noexcept
template <typename _TW>
_GLIBCXX_SIMD_INTRINSIC constexpr _TW
__xor(_TW __a, _TW __b) noexcept
{
static_assert(sizeof...(_Dummy) == 0);
using _Up = typename _TVT::value_type;
using _Ip = make_unsigned_t<__int_for_sizeof_t<_Up>>;
return __vector_bitcast<_Up>(__vector_bitcast<_Ip>(__a)
^ __vector_bitcast<_Ip>(__b));
}

template <typename _Tp, typename = decltype(_Tp() ^ _Tp())>
_GLIBCXX_SIMD_INTRINSIC constexpr _Tp
__xor(_Tp __a, _Tp __b) noexcept
{
return __a ^ __b;
if constexpr (__is_vector_type_v<_TW> || __is_simd_wrapper_v<_TW>)
{
using _TVT = _VectorTraits<_TW>;
using _Tp = typename _TVT::value_type;
if constexpr (std::is_floating_point_v<_Tp>)
{
using _Ip = make_unsigned_t<__int_for_sizeof_t<_Tp>>;
return __vector_bitcast<_Tp>(__vector_bitcast<_Ip>(__a)
^ __vector_bitcast<_Ip>(__b));
}
else if constexpr (__is_vector_type_v<_TW>)
return __a ^ __b;
else
return __a._M_data ^ __b._M_data;
}
else
return __a ^ __b;
}

// }}}
// __or{{{
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>, typename... _Dummy>
_GLIBCXX_SIMD_INTRINSIC constexpr _Tp
__or(_Tp __a, typename _TVT::type __b, _Dummy...) noexcept
{
static_assert(sizeof...(_Dummy) == 0);
using _Up = typename _TVT::value_type;
using _Ip = make_unsigned_t<__int_for_sizeof_t<_Up>>;
return __vector_bitcast<_Up>(__vector_bitcast<_Ip>(__a)
| __vector_bitcast<_Ip>(__b));
}

template <typename _Tp, typename = decltype(_Tp() | _Tp())>
_GLIBCXX_SIMD_INTRINSIC constexpr _Tp
__or(_Tp __a, _Tp __b) noexcept
template <typename _TW>
_GLIBCXX_SIMD_INTRINSIC constexpr _TW
__or(_TW __a, _TW __b) noexcept
{
return __a | __b;
if constexpr (__is_vector_type_v<_TW> || __is_simd_wrapper_v<_TW>)
{
using _TVT = _VectorTraits<_TW>;
using _Tp = typename _TVT::value_type;
if constexpr (std::is_floating_point_v<_Tp>)
{
using _Ip = make_unsigned_t<__int_for_sizeof_t<_Tp>>;
return __vector_bitcast<_Tp>(__vector_bitcast<_Ip>(__a)
| __vector_bitcast<_Ip>(__b));
}
else if constexpr (__is_vector_type_v<_TW>)
return __a | __b;
else
return __a._M_data | __b._M_data;
}
else
return __a | __b;
}

// }}}
// __and{{{
template <typename _Tp, typename _TVT = _VectorTraits<_Tp>, typename... _Dummy>
_GLIBCXX_SIMD_INTRINSIC constexpr _Tp
__and(_Tp __a, typename _TVT::type __b, _Dummy...) noexcept
{
static_assert(sizeof...(_Dummy) == 0);
using _Up = typename _TVT::value_type;
using _Ip = make_unsigned_t<__int_for_sizeof_t<_Up>>;
return __vector_bitcast<_Up>(__vector_bitcast<_Ip>(__a)
& __vector_bitcast<_Ip>(__b));
}

template <typename _Tp, typename = decltype(_Tp() & _Tp())>
_GLIBCXX_SIMD_INTRINSIC constexpr _Tp
__and(_Tp __a, _Tp __b) noexcept
template <typename _TW>
_GLIBCXX_SIMD_INTRINSIC constexpr _TW
__and(_TW __a, _TW __b) noexcept
{
return __a & __b;
if constexpr (__is_vector_type_v<_TW> || __is_simd_wrapper_v<_TW>)
{
using _TVT = _VectorTraits<_TW>;
using _Tp = typename _TVT::value_type;
if constexpr (std::is_floating_point_v<_Tp>)
{
using _Ip = make_unsigned_t<__int_for_sizeof_t<_Tp>>;
return __vector_bitcast<_Tp>(__vector_bitcast<_Ip>(__a)
& __vector_bitcast<_Ip>(__b));
}
else if constexpr (__is_vector_type_v<_TW>)
return __a & __b;
else
return __a._M_data & __b._M_data;
}
else
return __a & __b;
}

// }}}
Expand Down Expand Up @@ -1680,38 +1706,37 @@ static constexpr struct
} _S_x86_andnot;
#endif // _GLIBCXX_SIMD_X86INTRIN && !__clang__

template <typename _Tp, typename _TVT = _VectorTraits<_Tp>, typename... _Dummy>
_GLIBCXX_SIMD_INTRINSIC constexpr _Tp
__andnot(_Tp __a, typename _TVT::type __b, _Dummy...) noexcept
template <typename _TW>
_GLIBCXX_SIMD_INTRINSIC constexpr _TW
__andnot(_TW __a, _TW __b) noexcept
{
static_assert(sizeof...(_Dummy) == 0);
#if _GLIBCXX_SIMD_X86INTRIN && !defined __clang__
if constexpr (sizeof(_Tp) >= 16)
if constexpr (__is_vector_type_v<_TW> || __is_simd_wrapper_v<_TW>)
{
const auto __ai = __to_intrin(__a);
const auto __bi = __to_intrin(__b);
if (!__builtin_is_constant_evaluated()
&& !(__builtin_constant_p(__ai) && __builtin_constant_p(__bi)))
using _TVT = _VectorTraits<_TW>;
using _Tp = typename _TVT::value_type;
using _TV = typename _TVT::type;
#if _GLIBCXX_SIMD_X86INTRIN && !defined __clang__
if constexpr (sizeof(_TW) >= 16)
{
const auto __r = _S_x86_andnot(__ai, __bi);
if constexpr (is_convertible_v<decltype(__r), _Tp>)
return __r;
else
return reinterpret_cast<_Tp>(__r);
const auto __ai = __to_intrin(__a);
const auto __bi = __to_intrin(__b);
if (!__builtin_is_constant_evaluated()
&& !(__builtin_constant_p(__ai) && __builtin_constant_p(__bi)))
{
const auto __r = _S_x86_andnot(__ai, __bi);
if constexpr (is_convertible_v<decltype(__r), _TW>)
return __r;
else
return reinterpret_cast<_TV>(__r);
}
}
}
#endif // _GLIBCXX_SIMD_X86INTRIN
using _Up = typename _TVT::value_type;
using _Ip = make_unsigned_t<__int_for_sizeof_t<_Up>>;
return __vector_bitcast<_Up>(~__vector_bitcast<_Ip>(__a)
& __vector_bitcast<_Ip>(__b));
}

template <typename _Tp, typename = decltype(~_Tp() & _Tp())>
_GLIBCXX_SIMD_INTRINSIC constexpr _Tp
__andnot(_Tp __a, _Tp __b) noexcept
{
return ~__a & __b;
using _Ip = make_unsigned_t<__int_for_sizeof_t<_Tp>>;
return __vector_bitcast<_Tp>(~__vector_bitcast<_Ip>(__a)
& __vector_bitcast<_Ip>(__b));
}
else
return ~__a & __b;
}

// }}}
Expand Down Expand Up @@ -2068,7 +2093,7 @@ struct __intrinsic_type<
static constexpr std::size_t _VBytes
= _Bytes <= 16 ? 16 : _Bytes <= 32 ? 32 : 64;
using type [[__gnu__::__vector_size__(_VBytes)]]
= std::conditional_t<std::is_integral_v<_Tp>, long long int, _Tp>;
= std::conditional_t<std::is_integral_v<std::remove_reference_t<_Tp>>, long long int, std::remove_reference_t<_Tp>>;
};
#endif // _GLIBCXX_SIMD_HAVE_SSE

Expand Down Expand Up @@ -3559,8 +3584,7 @@ split(const simd_mask<typename _V::simd_type::value_type, _Ap>& __x)

// }}}
// split<_Sizes...>(simd) {{{
template <size_t... _Sizes, typename _Tp, typename _Ap,
typename = enable_if_t<((_Sizes + ...) == simd<_Tp, _Ap>::size())>>
template <size_t... _Sizes, typename _Tp, typename _Ap, typename>
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

removing the SFINAE condition breaks the spec

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

SFINAE is there in declaration. Here it is definition. clang says it is redefinition of default argument

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Declaration is at line 3314

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK thanks. I'll take a look

_GLIBCXX_SIMD_ALWAYS_INLINE
std::tuple<simd<_Tp, simd_abi::deduce_t<_Tp, _Sizes>>...>
split(const simd<_Tp, _Ap>& __x)
Expand Down
48 changes: 28 additions & 20 deletions experimental/bits/simd_builtin.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,16 +37,17 @@
_GLIBCXX_SIMD_BEGIN_NAMESPACE
// _S_allbits{{{
template <typename _V>
static inline constexpr _V _S_allbits
static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
= reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>());

// }}}
// _S_signmask, _S_absmask{{{
template <typename _V, typename = _VectorTraits<_V>>
static inline constexpr _V _S_signmask = __xor(_V() + 1, _V() - 1);
static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask = __xor(_V() + 1, _V() - 1);
//__andnot(_S_signmask<_V>, _S_allbits<_V>) does not work in Clang for unknown reason
template <typename _V, typename = _VectorTraits<_V>>
static inline constexpr _V _S_absmask
= __andnot(_S_signmask<_V>, _S_allbits<_V>);
static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
= __andnot(_S_signmask<_V>, reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>()));

//}}}
// __vector_permute<Indices...>{{{
Expand Down Expand Up @@ -566,6 +567,16 @@ __convert(_From __v0, _More... __vs)

// }}}
// __convert_all{{{
template< std::size_t _Np, typename _ToT, typename _R >
struct __make_array_impl {
template< typename T >
auto operator() ( std::initializer_list<T> __xs ) {
return __call_with_subscripts(
__xs.begin(), std::make_index_sequence<_Np>(),
[](auto... __ys) { return _R{__vector_bitcast<_ToT>(__ys)...}; });
}
};

// Converts __v into std::array<_To, N>, where N is _NParts if non-zero or
// otherwise deduced from _To such that N * #elements(_To) <= #elements(__v).
// Note: this function may return less than all converted elements
Expand Down Expand Up @@ -624,11 +635,8 @@ __convert_all(_From __v)
return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
};
[[maybe_unused]] const auto __vi = __to_intrin(__v);
auto&& __make_array = [](std::initializer_list<auto> __xs) {
return __call_with_subscripts(
__xs.begin(), std::make_index_sequence<_Np>(),
[](auto... __ys) { return _R{__vector_bitcast<_ToT>(__ys)...}; });
};

auto&& __make_array = __make_array_impl<_Np, _ToT, _R>{};

if constexpr (_Np == 0)
return _R{};
Expand Down Expand Up @@ -1659,19 +1667,19 @@ template <typename _Abi> struct _SimdImplBuiltin
_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
__bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
{
return __and(__x._M_data, __y._M_data);
return __and(__x, __y);
}
template <typename _Tp, size_t _Np>
_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
__bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
{
return __or(__x._M_data, __y._M_data);
return __or(__x, __y);
}
template <typename _Tp, size_t _Np>
_GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
__bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
{
return __xor(__x._M_data, __y._M_data);
return __xor(__x, __y);
}
template <typename _Tp, size_t _Np>
_GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
Expand Down Expand Up @@ -1750,7 +1758,11 @@ template <typename _Abi> struct _SimdImplBuiltin
_GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
__negate(_SimdWrapper<_Tp, _Np> __x) noexcept
{
#if !defined __clang__
return __vector_bitcast<_Tp>(!__x._M_data);
#else
return __vector_bitcast<_Tp>(__x._M_data == decltype(__x._M_data){});
#endif
}

// __min, __max, __minmax {{{2
Expand Down Expand Up @@ -2115,7 +2127,7 @@ template <typename _Abi> struct _SimdImplBuiltin
const _V __absx = __and(__x, _S_absmask<_V>);
static_assert(CHAR_BIT * sizeof(1ull)
>= std::numeric_limits<value_type>::digits);
constexpr _V __shifter_abs
_GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
= _V() + (1ull << (std::numeric_limits<value_type>::digits - 1));
const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
_V __shifted = __x + __shifter;
Expand Down Expand Up @@ -2720,7 +2732,7 @@ template <typename _Abi> struct _MaskImplBuiltin : _MaskImplBuiltinMixin
__bit_not(const _SimdWrapper<_Tp, _Np>& __x)
{
if constexpr(_Abi::_S_is_partial)
return __andnot(__x._M_data, _Abi::template __implicit_mask<_Tp>());
return __andnot(__x, _Abi::template __implicit_mask<_Tp>());
else
return __not(__x._M_data);
}
Expand Down Expand Up @@ -2794,13 +2806,9 @@ template <typename _Abi> struct _MaskImplBuiltin : _MaskImplBuiltinMixin
if (__builtin_constant_p(__rhs))
{
if (__rhs == false)
{
__lhs = __andnot(__k._M_data, __lhs._M_data);
}
__lhs = __andnot(__k, __lhs);
else
{
__lhs = __or(__k._M_data, __lhs._M_data);
}
__lhs = __or(__k, __lhs);
return;
}
__lhs
Expand Down
Loading