libstdc++
simd_builtin.h
1 // Simd Abi specific implementations -*- C++ -*-
2 
3 // Copyright (C) 2020-2023 Free Software Foundation, Inc.
4 //
5 // This file is part of the GNU ISO C++ Library. This library is free
6 // software; you can redistribute it and/or modify it under the
7 // terms of the GNU General Public License as published by the
8 // Free Software Foundation; either version 3, or (at your option)
9 // any later version.
10 
11 // This library is distributed in the hope that it will be useful,
12 // but WITHOUT ANY WARRANTY; without even the implied warranty of
13 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 // GNU General Public License for more details.
15 
16 // Under Section 7 of GPL version 3, you are granted additional
17 // permissions described in the GCC Runtime Library Exception, version
18 // 3.1, as published by the Free Software Foundation.
19 
20 // You should have received a copy of the GNU General Public License and
21 // a copy of the GCC Runtime Library Exception along with this program;
22 // see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
23 // <http://www.gnu.org/licenses/>.
24 
25 #ifndef _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
26 #define _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
27 
28 #if __cplusplus >= 201703L
29 
30 #include <array>
31 #include <cmath>
32 #include <cstdlib>
33 
34 _GLIBCXX_SIMD_BEGIN_NAMESPACE
35 // _S_allbits{{{
36 template <typename _V>
37  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_allbits
38  = reinterpret_cast<_V>(~__vector_type_t<char, sizeof(_V) / sizeof(char)>());
39 
40 // }}}
41 // _S_signmask, _S_absmask{{{
42 template <typename _V, typename = _VectorTraits<_V>>
43  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_signmask
44  = __xor(_V() + 1, _V() - 1);
45 
46 template <typename _V, typename = _VectorTraits<_V>>
47  static inline _GLIBCXX_SIMD_USE_CONSTEXPR _V _S_absmask
48  = __andnot(_S_signmask<_V>, _S_allbits<_V>);
49 
50 //}}}
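// Example (illustrative sketch, assuming a 4-float builtin vector type):
//   using _V = __vector_type_t<float, 4>;
//   // _S_signmask<_V> holds 0x8000'0000 in every lane (+1.f XOR -1.f keeps
//   // only the sign bit); _S_absmask<_V> holds the complement 0x7fff'ffff.
//   _V __abs = __and(__x, _S_absmask<_V>);   // lane-wise fabs
//   _V __neg = __xor(__x, _S_signmask<_V>);  // lane-wise negation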
51 // __vector_permute<Indices...>{{{
52 // Index == -1 requests zeroing of the output element
53 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
54  typename = __detail::__odr_helper>
55  constexpr _Tp
56  __vector_permute(_Tp __x)
57  {
58  static_assert(sizeof...(_Indices) == _TVT::_S_full_size);
59  return __make_vector<typename _TVT::value_type>(
60  (_Indices == -1 ? 0 : __x[_Indices == -1 ? 0 : _Indices])...);
61  }
62 
63 // }}}
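// Example (illustrative sketch): one index per output lane, -1 zeroes it:
//   __vector_type_t<float, 4> __v = {1.f, 2.f, 3.f, 4.f};
//   auto __rev = __vector_permute<3, 2, 1, 0>(__v);   // {4, 3, 2, 1}
//   auto __dup = __vector_permute<0, 0, -1, -1>(__v); // {1, 1, 0, 0}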
64 // __vector_shuffle<Indices...>{{{
65 // Index == -1 requests zeroing of the output element
66 template <int... _Indices, typename _Tp, typename _TVT = _VectorTraits<_Tp>,
67  typename = __detail::__odr_helper>
68  constexpr _Tp
69  __vector_shuffle(_Tp __x, _Tp __y)
70  {
71  return _Tp{(_Indices == -1 ? 0
72  : _Indices < _TVT::_S_full_size
73  ? __x[_Indices]
74  : __y[_Indices - _TVT::_S_full_size])...};
75  }
76 
77 // }}}
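// Example (illustrative sketch): indices below _S_full_size select from __x,
// the rest from __y, and -1 zeroes the output element:
//   __vector_type_t<int, 4> __a = {0, 1, 2, 3}, __b = {4, 5, 6, 7};
//   auto __lo  = __vector_shuffle<0, 1, 4, 5>(__a, __b);  // {0, 1, 4, 5}
//   auto __mix = __vector_shuffle<7, 3, -1, 0>(__a, __b); // {7, 3, 0, 0}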
78 // __make_wrapper{{{
79 template <typename _Tp, typename... _Args>
80  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, sizeof...(_Args)>
81  __make_wrapper(const _Args&... __args)
82  { return __make_vector<_Tp>(__args...); }
83 
84 // }}}
85 // __wrapper_bitcast{{{
86 template <typename _Tp, size_t _ToN = 0, typename _Up, size_t _M,
87  size_t _Np = _ToN != 0 ? _ToN : sizeof(_Up) * _M / sizeof(_Tp)>
88  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<_Tp, _Np>
89  __wrapper_bitcast(_SimdWrapper<_Up, _M> __x)
90  {
91  static_assert(_Np > 1);
92  return __intrin_bitcast<__vector_type_t<_Tp, _Np>>(__x._M_data);
93  }
94 
95 // }}}
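// Example (illustrative sketch): reinterpret the 16 bytes of a wrapper of four
// ints as eight shorts (a bit cast, no value conversion):
//   _SimdWrapper<int, 4> __w = __make_wrapper<int>(1, 2, 3, 4);
//   _SimdWrapper<short, 8> __s = __wrapper_bitcast<short>(__w);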
96 // __shift_elements_right{{{
97 // if (__shift % 2ⁿ == 0) => the low n Bytes are correct
98 template <unsigned __shift, typename _Tp, typename _TVT = _VectorTraits<_Tp>>
99  _GLIBCXX_SIMD_INTRINSIC _Tp
100  __shift_elements_right(_Tp __v)
101  {
102  [[maybe_unused]] const auto __iv = __to_intrin(__v);
103  static_assert(__shift <= sizeof(_Tp));
104  if constexpr (__shift == 0)
105  return __v;
106  else if constexpr (__shift == sizeof(_Tp))
107  return _Tp();
108 #if _GLIBCXX_SIMD_X86INTRIN // {{{
109  else if constexpr (__have_sse && __shift == 8
110  && _TVT::template _S_is<float, 4>)
111  return _mm_movehl_ps(__iv, __iv);
112  else if constexpr (__have_sse2 && __shift == 8
113  && _TVT::template _S_is<double, 2>)
114  return _mm_unpackhi_pd(__iv, __iv);
115  else if constexpr (__have_sse2 && sizeof(_Tp) == 16)
116  return reinterpret_cast<typename _TVT::type>(
117  _mm_srli_si128(reinterpret_cast<__m128i>(__iv), __shift));
118  else if constexpr (__shift == 16 && sizeof(_Tp) == 32)
119  {
120  /*if constexpr (__have_avx && _TVT::template _S_is<double, 4>)
121  return _mm256_permute2f128_pd(__iv, __iv, 0x81);
122  else if constexpr (__have_avx && _TVT::template _S_is<float, 8>)
123  return _mm256_permute2f128_ps(__iv, __iv, 0x81);
124  else if constexpr (__have_avx)
125  return reinterpret_cast<typename _TVT::type>(
126  _mm256_permute2f128_si256(__iv, __iv, 0x81));
127  else*/
128  return __zero_extend(__hi128(__v));
129  }
130  else if constexpr (__have_avx2 && sizeof(_Tp) == 32 && __shift < 16)
131  {
132  const auto __vll = __vector_bitcast<_LLong>(__v);
133  return reinterpret_cast<typename _TVT::type>(
134  _mm256_alignr_epi8(_mm256_permute2x128_si256(__vll, __vll, 0x81),
135  __vll, __shift));
136  }
137  else if constexpr (__have_avx && sizeof(_Tp) == 32 && __shift < 16)
138  {
139  const auto __vll = __vector_bitcast<_LLong>(__v);
140  return reinterpret_cast<typename _TVT::type>(
141  __concat(_mm_alignr_epi8(__hi128(__vll), __lo128(__vll), __shift),
142  _mm_srli_si128(__hi128(__vll), __shift)));
143  }
144  else if constexpr (sizeof(_Tp) == 32 && __shift > 16)
145  return __zero_extend(__shift_elements_right<__shift - 16>(__hi128(__v)));
146  else if constexpr (sizeof(_Tp) == 64 && __shift == 32)
147  return __zero_extend(__hi256(__v));
148  else if constexpr (__have_avx512f && sizeof(_Tp) == 64)
149  {
150  if constexpr (__shift >= 48)
151  return __zero_extend(
152  __shift_elements_right<__shift - 48>(__extract<3, 4>(__v)));
153  else if constexpr (__shift >= 32)
154  return __zero_extend(
155  __shift_elements_right<__shift - 32>(__hi256(__v)));
156  else if constexpr (__shift % 8 == 0)
157  return reinterpret_cast<typename _TVT::type>(
158  _mm512_alignr_epi64(__m512i(), __intrin_bitcast<__m512i>(__v),
159  __shift / 8));
160  else if constexpr (__shift % 4 == 0)
161  return reinterpret_cast<typename _TVT::type>(
162  _mm512_alignr_epi32(__m512i(), __intrin_bitcast<__m512i>(__v),
163  __shift / 4));
164  else if constexpr (__have_avx512bw && __shift < 16)
165  {
166  const auto __vll = __vector_bitcast<_LLong>(__v);
167  return reinterpret_cast<typename _TVT::type>(
168  _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __vll, 0xf9),
169  __vll, __shift));
170  }
171  else if constexpr (__have_avx512bw && __shift < 32)
172  {
173  const auto __vll = __vector_bitcast<_LLong>(__v);
174  return reinterpret_cast<typename _TVT::type>(
175  _mm512_alignr_epi8(_mm512_shuffle_i32x4(__vll, __m512i(), 0xee),
176  _mm512_shuffle_i32x4(__vll, __vll, 0xf9),
177  __shift - 16));
178  }
179  else
180  __assert_unreachable<_Tp>();
181  }
182  /*
183  } else if constexpr (__shift % 16 == 0 && sizeof(_Tp) == 64)
184  return __auto_bitcast(__extract<__shift / 16, 4>(__v));
185  */
186 #endif // _GLIBCXX_SIMD_X86INTRIN }}}
187  else
188  {
189  constexpr int __chunksize = __shift % 8 == 0 ? 8
190  : __shift % 4 == 0 ? 4
191  : __shift % 2 == 0 ? 2
192  : 1;
193  auto __w = __vector_bitcast<__int_with_sizeof_t<__chunksize>>(__v);
194  using _Up = decltype(__w);
195  return __intrin_bitcast<_Tp>(
196  __call_with_n_evaluations<(sizeof(_Tp) - __shift) / __chunksize>(
197  [](auto... __chunks) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
198  return _Up{__chunks...};
199  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
200  return __w[__shift / __chunksize + __i];
201  }));
202  }
203  }
204 
205 // }}}
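// Example (illustrative sketch): the shift is given in bytes; elements move
// towards index 0 and the top is zero-filled, like _mm_srli_si128:
//   __vector_type_t<int, 4> __v = {1, 2, 3, 4};
//   auto __r = __shift_elements_right<4>(__v); // {2, 3, 4, 0}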
206 // __extract_part(_SimdWrapper<_Tp, _Np>) {{{
207 template <int _Index, int _Total, int _Combine, typename _Tp, size_t _Np>
208  _GLIBCXX_SIMD_INTRINSIC _GLIBCXX_CONST constexpr
209  _SimdWrapper<_Tp, _Np / _Total * _Combine>
210  __extract_part(const _SimdWrapper<_Tp, _Np> __x)
211  {
212  if constexpr (_Index % 2 == 0 && _Total % 2 == 0 && _Combine % 2 == 0)
213  return __extract_part<_Index / 2, _Total / 2, _Combine / 2>(__x);
214  else
215  {
216  constexpr size_t __values_per_part = _Np / _Total;
217  constexpr size_t __values_to_skip = _Index * __values_per_part;
218  constexpr size_t __return_size = __values_per_part * _Combine;
219  using _R = __vector_type_t<_Tp, __return_size>;
220  static_assert((_Index + _Combine) * __values_per_part * sizeof(_Tp)
221  <= sizeof(__x),
222  "out of bounds __extract_part");
223  // the following assertion would ensure no "padding" to be read
224  // static_assert(_Total >= _Index + _Combine, "_Total must be greater
225  // than _Index");
226 
227  // static_assert(__return_size * _Total == _Np, "_Np must be divisible
228  // by _Total");
229  if (__x._M_is_constprop())
230  return __generate_from_n_evaluations<__return_size, _R>(
231  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
232  return __x[__values_to_skip + __i];
233  });
234  if constexpr (_Index == 0 && _Total == 1)
235  return __x;
236  else if constexpr (_Index == 0)
237  return __intrin_bitcast<_R>(__as_vector(__x));
238 #if _GLIBCXX_SIMD_X86INTRIN // {{{
239  else if constexpr (sizeof(__x) == 32
240  && __return_size * sizeof(_Tp) <= 16)
241  {
242  constexpr size_t __bytes_to_skip = __values_to_skip * sizeof(_Tp);
243  if constexpr (__bytes_to_skip == 16)
244  return __vector_bitcast<_Tp, __return_size>(
245  __hi128(__as_vector(__x)));
246  else
247  return __vector_bitcast<_Tp, __return_size>(
248  _mm_alignr_epi8(__hi128(__vector_bitcast<_LLong>(__x)),
249  __lo128(__vector_bitcast<_LLong>(__x)),
250  __bytes_to_skip));
251  }
252 #endif // _GLIBCXX_SIMD_X86INTRIN }}}
253  else if constexpr (_Index > 0
254  && (__values_to_skip % __return_size != 0
255  || sizeof(_R) >= 8)
256  && (__values_to_skip + __return_size) * sizeof(_Tp)
257  <= 64
258  && sizeof(__x) >= 16)
259  return __intrin_bitcast<_R>(
260  __shift_elements_right<__values_to_skip * sizeof(_Tp)>(
261  __as_vector(__x)));
262  else
263  {
264  _R __r = {};
265  __builtin_memcpy(&__r,
266  reinterpret_cast<const char*>(&__x)
267  + sizeof(_Tp) * __values_to_skip,
268  __return_size * sizeof(_Tp));
269  return __r;
270  }
271  }
272  }
273 
274 // }}}
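// Example (illustrative sketch): _Total splits the wrapper into equal parts,
// _Index selects the first part and _Combine how many adjacent parts to return:
//   _SimdWrapper<int, 8> __w = __make_wrapper<int>(0, 1, 2, 3, 4, 5, 6, 7);
//   auto __hi  = __extract_part<1, 2, 1>(__w); // _SimdWrapper<int, 4> {4, 5, 6, 7}
//   auto __mid = __extract_part<1, 4, 2>(__w); // _SimdWrapper<int, 4> {2, 3, 4, 5}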
275 // __extract_part(_SimdWrapper<bool, _Np>) {{{
276 template <int _Index, int _Total, int _Combine = 1, size_t _Np>
277  _GLIBCXX_SIMD_INTRINSIC constexpr _SimdWrapper<bool, _Np / _Total * _Combine>
278  __extract_part(const _SimdWrapper<bool, _Np> __x)
279  {
280  static_assert(_Combine == 1, "_Combine != 1 not implemented");
281  static_assert(__have_avx512f && _Np == _Np);
282  static_assert(_Total >= 2 && _Index + _Combine <= _Total && _Index >= 0);
283  return __x._M_data >> (_Index * _Np / _Total);
284  }
285 
286 // }}}
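// Example (illustrative sketch): for bitmasks the extraction is a plain shift,
// e.g. extracting the high half of an 8-bit mask:
//   _SimdWrapper<bool, 8> __k(0b1011'0100);
//   auto __hi = __extract_part<1, 2>(__k); // low 4 bits == 0b1011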
287 
288 // __vector_convert {{{
289 // implementation requires an index sequence
290 template <typename _To, typename _From, size_t... _I>
291  _GLIBCXX_SIMD_INTRINSIC constexpr _To
292  __vector_convert(_From __a, index_sequence<_I...>)
293  {
294  using _Tp = typename _VectorTraits<_To>::value_type;
295  return _To{static_cast<_Tp>(__a[_I])...};
296  }
297 
298 template <typename _To, typename _From, size_t... _I>
299  _GLIBCXX_SIMD_INTRINSIC constexpr _To
300  __vector_convert(_From __a, _From __b, index_sequence<_I...>)
301  {
302  using _Tp = typename _VectorTraits<_To>::value_type;
303  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...};
304  }
305 
306 template <typename _To, typename _From, size_t... _I>
307  _GLIBCXX_SIMD_INTRINSIC constexpr _To
308  __vector_convert(_From __a, _From __b, _From __c, index_sequence<_I...>)
309  {
310  using _Tp = typename _VectorTraits<_To>::value_type;
311  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
312  static_cast<_Tp>(__c[_I])...};
313  }
314 
315 template <typename _To, typename _From, size_t... _I>
316  _GLIBCXX_SIMD_INTRINSIC constexpr _To
317  __vector_convert(_From __a, _From __b, _From __c, _From __d,
318  index_sequence<_I...>)
319  {
320  using _Tp = typename _VectorTraits<_To>::value_type;
321  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
322  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...};
323  }
324 
325 template <typename _To, typename _From, size_t... _I>
326  _GLIBCXX_SIMD_INTRINSIC constexpr _To
327  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
328  index_sequence<_I...>)
329  {
330  using _Tp = typename _VectorTraits<_To>::value_type;
331  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
332  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
333  static_cast<_Tp>(__e[_I])...};
334  }
335 
336 template <typename _To, typename _From, size_t... _I>
337  _GLIBCXX_SIMD_INTRINSIC constexpr _To
338  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
339  _From __f, index_sequence<_I...>)
340  {
341  using _Tp = typename _VectorTraits<_To>::value_type;
342  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
343  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
344  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...};
345  }
346 
347 template <typename _To, typename _From, size_t... _I>
348  _GLIBCXX_SIMD_INTRINSIC constexpr _To
349  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
350  _From __f, _From __g, index_sequence<_I...>)
351  {
352  using _Tp = typename _VectorTraits<_To>::value_type;
353  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
354  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
355  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
356  static_cast<_Tp>(__g[_I])...};
357  }
358 
359 template <typename _To, typename _From, size_t... _I>
360  _GLIBCXX_SIMD_INTRINSIC constexpr _To
361  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
362  _From __f, _From __g, _From __h, index_sequence<_I...>)
363  {
364  using _Tp = typename _VectorTraits<_To>::value_type;
365  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
366  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
367  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
368  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...};
369  }
370 
371 template <typename _To, typename _From, size_t... _I>
372  _GLIBCXX_SIMD_INTRINSIC constexpr _To
373  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
374  _From __f, _From __g, _From __h, _From __i,
375  index_sequence<_I...>)
376  {
377  using _Tp = typename _VectorTraits<_To>::value_type;
378  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
379  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
380  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
381  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
382  static_cast<_Tp>(__i[_I])...};
383  }
384 
385 template <typename _To, typename _From, size_t... _I>
386  _GLIBCXX_SIMD_INTRINSIC constexpr _To
387  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
388  _From __f, _From __g, _From __h, _From __i, _From __j,
389  index_sequence<_I...>)
390  {
391  using _Tp = typename _VectorTraits<_To>::value_type;
392  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
393  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
394  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
395  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
396  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...};
397  }
398 
399 template <typename _To, typename _From, size_t... _I>
400  _GLIBCXX_SIMD_INTRINSIC constexpr _To
401  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
402  _From __f, _From __g, _From __h, _From __i, _From __j,
403  _From __k, index_sequence<_I...>)
404  {
405  using _Tp = typename _VectorTraits<_To>::value_type;
406  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
407  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
408  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
409  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
410  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
411  static_cast<_Tp>(__k[_I])...};
412  }
413 
414 template <typename _To, typename _From, size_t... _I>
415  _GLIBCXX_SIMD_INTRINSIC constexpr _To
416  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
417  _From __f, _From __g, _From __h, _From __i, _From __j,
418  _From __k, _From __l, index_sequence<_I...>)
419  {
420  using _Tp = typename _VectorTraits<_To>::value_type;
421  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
422  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
423  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
424  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
425  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
426  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...};
427  }
428 
429 template <typename _To, typename _From, size_t... _I>
430  _GLIBCXX_SIMD_INTRINSIC constexpr _To
431  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
432  _From __f, _From __g, _From __h, _From __i, _From __j,
433  _From __k, _From __l, _From __m, index_sequence<_I...>)
434  {
435  using _Tp = typename _VectorTraits<_To>::value_type;
436  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
437  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
438  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
439  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
440  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
441  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
442  static_cast<_Tp>(__m[_I])...};
443  }
444 
445 template <typename _To, typename _From, size_t... _I>
446  _GLIBCXX_SIMD_INTRINSIC constexpr _To
447  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
448  _From __f, _From __g, _From __h, _From __i, _From __j,
449  _From __k, _From __l, _From __m, _From __n,
450  index_sequence<_I...>)
451  {
452  using _Tp = typename _VectorTraits<_To>::value_type;
453  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
454  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
455  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
456  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
457  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
458  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
459  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...};
460  }
461 
462 template <typename _To, typename _From, size_t... _I>
463  _GLIBCXX_SIMD_INTRINSIC constexpr _To
464  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
465  _From __f, _From __g, _From __h, _From __i, _From __j,
466  _From __k, _From __l, _From __m, _From __n, _From __o,
467  index_sequence<_I...>)
468  {
469  using _Tp = typename _VectorTraits<_To>::value_type;
470  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
471  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
472  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
473  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
474  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
475  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
476  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
477  static_cast<_Tp>(__o[_I])...};
478  }
479 
480 template <typename _To, typename _From, size_t... _I>
481  _GLIBCXX_SIMD_INTRINSIC constexpr _To
482  __vector_convert(_From __a, _From __b, _From __c, _From __d, _From __e,
483  _From __f, _From __g, _From __h, _From __i, _From __j,
484  _From __k, _From __l, _From __m, _From __n, _From __o,
485  _From __p, index_sequence<_I...>)
486  {
487  using _Tp = typename _VectorTraits<_To>::value_type;
488  return _To{static_cast<_Tp>(__a[_I])..., static_cast<_Tp>(__b[_I])...,
489  static_cast<_Tp>(__c[_I])..., static_cast<_Tp>(__d[_I])...,
490  static_cast<_Tp>(__e[_I])..., static_cast<_Tp>(__f[_I])...,
491  static_cast<_Tp>(__g[_I])..., static_cast<_Tp>(__h[_I])...,
492  static_cast<_Tp>(__i[_I])..., static_cast<_Tp>(__j[_I])...,
493  static_cast<_Tp>(__k[_I])..., static_cast<_Tp>(__l[_I])...,
494  static_cast<_Tp>(__m[_I])..., static_cast<_Tp>(__n[_I])...,
495  static_cast<_Tp>(__o[_I])..., static_cast<_Tp>(__p[_I])...};
496  }
497 
498 // Defer actual conversion to the overload that takes an index sequence. Note
499 // that this function adds zeros or drops values off the end if you don't ensure
500 // matching width.
501 template <typename _To, typename... _From, size_t _FromSize>
502  _GLIBCXX_SIMD_INTRINSIC constexpr _To
503  __vector_convert(_SimdWrapper<_From, _FromSize>... __xs)
504  {
505 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR85048
506  using _From0 = __first_of_pack_t<_From...>;
507  using _FW = _SimdWrapper<_From0, _FromSize>;
508  if (!_FW::_S_is_partial && !(... && __xs._M_is_constprop()))
509  {
510  if constexpr ((sizeof...(_From) & (sizeof...(_From) - 1))
511  == 0) // power-of-two number of arguments
512  return __convert_x86<_To>(__as_vector(__xs)...);
513  else // append zeros and recurse until the above branch is taken
514  return __vector_convert<_To>(__xs..., _FW{});
515  }
516  else
517 #endif
518  return __vector_convert<_To>(
519  __as_vector(__xs)...,
520  make_index_sequence<(sizeof...(__xs) == 1 ? std::min(
521  _VectorTraits<_To>::_S_full_size, int(_FromSize))
522  : _FromSize)>());
523  }
524 
525 // }}}
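// Example (illustrative sketch): the wrapper overload deduces the index
// sequence and forwards to one of the overloads above; widths must match if
// neither zero-padding nor truncation is wanted:
//   _SimdWrapper<int, 4> __a, __b;
//   auto __f = __vector_convert<__vector_type_t<float, 4>>(__a);      // 4 floats
//   auto __s = __vector_convert<__vector_type_t<short, 8>>(__a, __b); // 8 shorts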
526 // __convert function{{{
527 template <typename _To, typename _From, typename... _More>
528  _GLIBCXX_SIMD_INTRINSIC constexpr auto
529  __convert(_From __v0, _More... __vs)
530  {
531  static_assert((true && ... && is_same_v<_From, _More>) );
532  if constexpr (__is_vectorizable_v<_From>)
533  {
534  using _V = typename _VectorTraits<_To>::type;
535  using _Tp = typename _VectorTraits<_To>::value_type;
536  return _V{static_cast<_Tp>(__v0), static_cast<_Tp>(__vs)...};
537  }
538  else if constexpr (__is_vector_type_v<_From>)
539  return __convert<_To>(__as_wrapper(__v0), __as_wrapper(__vs)...);
540  else // _SimdWrapper arguments
541  {
542  constexpr size_t __input_size = _From::_S_size * (1 + sizeof...(_More));
543  if constexpr (__is_vectorizable_v<_To>)
544  return __convert<__vector_type_t<_To, __input_size>>(__v0, __vs...);
545  else if constexpr (!__is_vector_type_v<_To>)
546  return _To(__convert<typename _To::_BuiltinType>(__v0, __vs...));
547  else
548  {
549  static_assert(
550  sizeof...(_More) == 0
551  || _VectorTraits<_To>::_S_full_size >= __input_size,
552  "__convert(...) requires the input to fit into the output");
553  return __vector_convert<_To>(__v0, __vs...);
554  }
555  }
556  }
557 
558 // }}}
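// Example (illustrative sketch): _To may be a scalar type, a vector type, or a
// _SimdWrapper; the destination width is derived from the inputs:
//   _SimdWrapper<int, 4> __w;
//   auto __v = __convert<float>(__w);                  // __vector_type_t<float, 4>
//   auto __x = __convert<_SimdWrapper<float, 4>>(__w); // wrapper of 4 floats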
559 // __convert_all{{{
560 // Converts __v into array<_To, N>, where N is _NParts if non-zero or
561 // otherwise deduced from _To such that N * #elements(_To) <= #elements(__v).
562 // Note: this function may return fewer than all converted elements
563 template <typename _To,
564  size_t _NParts = 0, // allows converting fewer or more (only the last
565  // _To may be partially filled) than all
566  size_t _Offset = 0, // where to start, # of elements (not Bytes or
567  // Parts)
568  typename _From, typename _FromVT = _VectorTraits<_From>>
569  _GLIBCXX_SIMD_INTRINSIC auto
570  __convert_all(_From __v)
571  {
572  if constexpr (is_arithmetic_v<_To> && _NParts != 1)
573  {
574  static_assert(_Offset < _FromVT::_S_full_size);
575  constexpr auto _Np
576  = _NParts == 0 ? _FromVT::_S_partial_width - _Offset : _NParts;
577  return __generate_from_n_evaluations<_Np, array<_To, _Np>>(
578  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
579  return static_cast<_To>(__v[__i + _Offset]);
580  });
581  }
582  else
583  {
584  static_assert(__is_vector_type_v<_To>);
585  using _ToVT = _VectorTraits<_To>;
586  if constexpr (__is_vector_type_v<_From>)
587  return __convert_all<_To, _NParts>(__as_wrapper(__v));
588  else if constexpr (_NParts == 1)
589  {
590  static_assert(_Offset % _ToVT::_S_full_size == 0);
591  return array<_To, 1>{__vector_convert<_To>(
592  __extract_part<_Offset / _ToVT::_S_full_size,
593  __div_roundup(_FromVT::_S_partial_width,
594  _ToVT::_S_full_size)>(__v))};
595  }
596 #if _GLIBCXX_SIMD_X86INTRIN // {{{
597  else if constexpr (!__have_sse4_1 && _Offset == 0
598  && is_integral_v<typename _FromVT::value_type>
599  && sizeof(typename _FromVT::value_type)
600  < sizeof(typename _ToVT::value_type)
601  && !(sizeof(typename _FromVT::value_type) == 4
602  && is_same_v<typename _ToVT::value_type, double>))
603  {
604  using _ToT = typename _ToVT::value_type;
605  using _FromT = typename _FromVT::value_type;
606  constexpr size_t _Np
607  = _NParts != 0
608  ? _NParts
609  : (_FromVT::_S_partial_width / _ToVT::_S_full_size);
610  using _R = array<_To, _Np>;
611  // __adjust modifies its input to have _Np (use _SizeConstant)
612  // entries so that no unnecessary intermediate conversions are
613  // requested and, more importantly, no intermediate conversions are
614  // missing
615  [[maybe_unused]] auto __adjust
616  = [](auto __n,
617  auto __vv) -> _SimdWrapper<_FromT, decltype(__n)::value> {
618  return __vector_bitcast<_FromT, decltype(__n)::value>(__vv);
619  };
620  [[maybe_unused]] const auto __vi = __to_intrin(__v);
621  auto&& __make_array
622  = [](auto __x0, [[maybe_unused]] auto __x1) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
623  if constexpr (_Np == 1)
624  return _R{__intrin_bitcast<_To>(__x0)};
625  else
626  return _R{__intrin_bitcast<_To>(__x0),
627  __intrin_bitcast<_To>(__x1)};
628  };
629 
630  if constexpr (_Np == 0)
631  return _R{};
632  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 2)
633  {
634  static_assert(is_integral_v<_FromT>);
635  static_assert(is_integral_v<_ToT>);
636  if constexpr (is_unsigned_v<_FromT>)
637  return __make_array(_mm_unpacklo_epi8(__vi, __m128i()),
638  _mm_unpackhi_epi8(__vi, __m128i()));
639  else
640  return __make_array(
641  _mm_srai_epi16(_mm_unpacklo_epi8(__vi, __vi), 8),
642  _mm_srai_epi16(_mm_unpackhi_epi8(__vi, __vi), 8));
643  }
644  else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 4)
645  {
646  static_assert(is_integral_v<_FromT>);
647  if constexpr (is_floating_point_v<_ToT>)
648  {
649  const auto __ints
650  = __convert_all<__vector_type16_t<int>, _Np>(
651  __adjust(_SizeConstant<_Np * 4>(), __v));
652  return __generate_from_n_evaluations<_Np, _R>(
653  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
654  return __vector_convert<_To>(__as_wrapper(__ints[__i]));
655  });
656  }
657  else if constexpr (is_unsigned_v<_FromT>)
658  return __make_array(_mm_unpacklo_epi16(__vi, __m128i()),
659  _mm_unpackhi_epi16(__vi, __m128i()));
660  else
661  return __make_array(
662  _mm_srai_epi32(_mm_unpacklo_epi16(__vi, __vi), 16),
663  _mm_srai_epi32(_mm_unpackhi_epi16(__vi, __vi), 16));
664  }
665  else if constexpr (sizeof(_FromT) == 4 && sizeof(_ToT) == 8
666  && is_integral_v<_FromT> && is_integral_v<_ToT>)
667  {
668  if constexpr (is_unsigned_v<_FromT>)
669  return __make_array(_mm_unpacklo_epi32(__vi, __m128i()),
670  _mm_unpackhi_epi32(__vi, __m128i()));
671  else
672  return __make_array(
673  _mm_unpacklo_epi32(__vi, _mm_srai_epi32(__vi, 31)),
674  _mm_unpackhi_epi32(__vi, _mm_srai_epi32(__vi, 31)));
675  }
687  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) >= 4
688  && is_signed_v<_FromT>)
689  {
690  const __m128i __vv[2] = {_mm_unpacklo_epi8(__vi, __vi),
691  _mm_unpackhi_epi8(__vi, __vi)};
692  const __vector_type_t<int, 4> __vvvv[4] = {
693  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[0], __vv[0])),
694  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[0], __vv[0])),
695  __vector_bitcast<int>(_mm_unpacklo_epi16(__vv[1], __vv[1])),
696  __vector_bitcast<int>(_mm_unpackhi_epi16(__vv[1], __vv[1]))};
697  if constexpr (sizeof(_ToT) == 4)
698  return __generate_from_n_evaluations<_Np, _R>(
699  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
700  return __vector_convert<_To>(
701  _SimdWrapper<int, 4>(__vvvv[__i] >> 24));
702  });
703  else if constexpr (is_integral_v<_ToT>)
704  return __generate_from_n_evaluations<_Np, _R>(
705  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
706  const auto __signbits = __to_intrin(__vvvv[__i / 2] >> 31);
707  const auto __sx32 = __to_intrin(__vvvv[__i / 2] >> 24);
708  return __vector_bitcast<_ToT>(
709  __i % 2 == 0 ? _mm_unpacklo_epi32(__sx32, __signbits)
710  : _mm_unpackhi_epi32(__sx32, __signbits));
711  });
712  else
713  return __generate_from_n_evaluations<_Np, _R>(
714  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
715  const _SimdWrapper<int, 4> __int4 = __vvvv[__i / 2] >> 24;
716  return __vector_convert<_To>(
717  __i % 2 == 0 ? __int4
718  : _SimdWrapper<int, 4>(
719  _mm_unpackhi_epi64(__to_intrin(__int4),
720  __to_intrin(__int4))));
721  });
722  }
723  else if constexpr (sizeof(_FromT) == 1 && sizeof(_ToT) == 4)
724  {
725  const auto __shorts = __convert_all<__vector_type16_t<
726  conditional_t<is_signed_v<_FromT>, short, unsigned short>>>(
727  __adjust(_SizeConstant<(_Np + 1) / 2 * 8>(), __v));
728  return __generate_from_n_evaluations<_Np, _R>(
729  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
730  return __convert_all<_To>(__shorts[__i / 2])[__i % 2];
731  });
732  }
733  else if constexpr (sizeof(_FromT) == 2 && sizeof(_ToT) == 8
734  && is_signed_v<_FromT> && is_integral_v<_ToT>)
735  {
736  const __m128i __vv[2] = {_mm_unpacklo_epi16(__vi, __vi),
737  _mm_unpackhi_epi16(__vi, __vi)};
738  const __vector_type16_t<int> __vvvv[4]
739  = {__vector_bitcast<int>(
740  _mm_unpacklo_epi32(_mm_srai_epi32(__vv[0], 16),
741  _mm_srai_epi32(__vv[0], 31))),
742  __vector_bitcast<int>(
743  _mm_unpackhi_epi32(_mm_srai_epi32(__vv[0], 16),
744  _mm_srai_epi32(__vv[0], 31))),
745  __vector_bitcast<int>(
746  _mm_unpacklo_epi32(_mm_srai_epi32(__vv[1], 16),
747  _mm_srai_epi32(__vv[1], 31))),
748  __vector_bitcast<int>(
749  _mm_unpackhi_epi32(_mm_srai_epi32(__vv[1], 16),
750  _mm_srai_epi32(__vv[1], 31)))};
751  return __generate_from_n_evaluations<_Np, _R>(
752  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
753  return __vector_bitcast<_ToT>(__vvvv[__i]);
754  });
755  }
756  else if constexpr (sizeof(_FromT) <= 2 && sizeof(_ToT) == 8)
757  {
758  const auto __ints
759  = __convert_all<__vector_type16_t<conditional_t<
760  is_signed_v<_FromT> || is_floating_point_v<_ToT>, int,
761  unsigned int>>>(
762  __adjust(_SizeConstant<(_Np + 1) / 2 * 4>(), __v));
763  return __generate_from_n_evaluations<_Np, _R>(
764  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
765  return __convert_all<_To>(__ints[__i / 2])[__i % 2];
766  });
767  }
768  else
769  __assert_unreachable<_To>();
770  }
771 #endif // _GLIBCXX_SIMD_X86INTRIN }}}
772  else if constexpr ((_FromVT::_S_partial_width - _Offset)
773  > _ToVT::_S_full_size)
774  {
775  /*
776  static_assert(
777  (_FromVT::_S_partial_width & (_FromVT::_S_partial_width - 1)) ==
778  0,
779  "__convert_all only supports power-of-2 number of elements.
780  Otherwise " "the return type cannot be array<_To, N>.");
781  */
782  constexpr size_t _NTotal
783  = (_FromVT::_S_partial_width - _Offset) / _ToVT::_S_full_size;
784  constexpr size_t _Np = _NParts == 0 ? _NTotal : _NParts;
785  static_assert(
786  _Np <= _NTotal
787  || (_Np == _NTotal + 1
788  && (_FromVT::_S_partial_width - _Offset) % _ToVT::_S_full_size
789  > 0));
790  using _R = array<_To, _Np>;
791  if constexpr (_Np == 1)
792  return _R{__vector_convert<_To>(
793  __extract_part<_Offset, _FromVT::_S_partial_width,
794  _ToVT::_S_full_size>(__v))};
795  else
796  return __generate_from_n_evaluations<_Np, _R>(
797  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
798  auto __part
799  = __extract_part<__i * _ToVT::_S_full_size + _Offset,
800  _FromVT::_S_partial_width,
801  _ToVT::_S_full_size>(__v);
802  return __vector_convert<_To>(__part);
803  });
804  }
805  else if constexpr (_Offset == 0)
806  return array<_To, 1>{__vector_convert<_To>(__v)};
807  else
808  return array<_To, 1>{__vector_convert<_To>(
809  __extract_part<_Offset, _FromVT::_S_partial_width,
810  _FromVT::_S_partial_width - _Offset>(__v))};
811  }
812  }
813 
814 // }}}
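// Example (illustrative sketch): widening 8 shorts yields an array of two
// 4-int vectors covering elements 0..3 and 4..7:
//   __vector_type_t<short, 8> __v = {0, 1, 2, 3, 4, 5, 6, 7};
//   auto __parts = __convert_all<__vector_type_t<int, 4>>(__v);
//   // __parts[0] == {0, 1, 2, 3}, __parts[1] == {4, 5, 6, 7}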
815 
816 // _GnuTraits {{{
817 template <typename _Tp, typename _Mp, typename _Abi, size_t _Np>
818  struct _GnuTraits
819  {
820  using _IsValid = true_type;
821  using _SimdImpl = typename _Abi::_SimdImpl;
822  using _MaskImpl = typename _Abi::_MaskImpl;
823 
824  // simd and simd_mask member types {{{
825  using _SimdMember = _SimdWrapper<_Tp, _Np>;
826  using _MaskMember = _SimdWrapper<_Mp, _Np>;
827  static constexpr size_t _S_simd_align = alignof(_SimdMember);
828  static constexpr size_t _S_mask_align = alignof(_MaskMember);
829 
830  // }}}
831  // size metadata {{{
832  static constexpr size_t _S_full_size = _SimdMember::_S_full_size;
833  static constexpr bool _S_is_partial = _SimdMember::_S_is_partial;
834 
835  // }}}
836  // _SimdBase / base class for simd, providing extra conversions {{{
837  struct _SimdBase2
838  {
839  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
840  operator __intrinsic_type_t<_Tp, _Np>() const
841  { return __to_intrin(static_cast<const simd<_Tp, _Abi>*>(this)->_M_data); }
842 
843  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
844  operator __vector_type_t<_Tp, _Np>() const
845  { return static_cast<const simd<_Tp, _Abi>*>(this)->_M_data.__builtin(); }
846  };
847 
848  struct _SimdBase1
849  {
850  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
851  operator __intrinsic_type_t<_Tp, _Np>() const
852  { return __data(*static_cast<const simd<_Tp, _Abi>*>(this)); }
853  };
854 
855  using _SimdBase = conditional_t<
856  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
857  _SimdBase1, _SimdBase2>;
858 
859  // }}}
860  // _MaskBase {{{
861  struct _MaskBase2
862  {
863  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
864  operator __intrinsic_type_t<_Tp, _Np>() const
865  { return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data.__intrin(); }
866 
867  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
868  operator __vector_type_t<_Tp, _Np>() const
869  { return static_cast<const simd_mask<_Tp, _Abi>*>(this)->_M_data._M_data; }
870  };
871 
872  struct _MaskBase1
873  {
874  _GLIBCXX_SIMD_ALWAYS_INLINE explicit
875  operator __intrinsic_type_t<_Tp, _Np>() const
876  { return __data(*static_cast<const simd_mask<_Tp, _Abi>*>(this)); }
877  };
878 
879  using _MaskBase = conditional_t<
880  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
881  _MaskBase1, _MaskBase2>;
882 
883  // }}}
884  // _MaskCastType {{{
885  // parameter type of one explicit simd_mask constructor
886  class _MaskCastType
887  {
888  using _Up = __intrinsic_type_t<_Tp, _Np>;
889  _Up _M_data;
890 
891  public:
892  _GLIBCXX_SIMD_ALWAYS_INLINE
893  _MaskCastType(_Up __x) : _M_data(__x) {}
894 
895  _GLIBCXX_SIMD_ALWAYS_INLINE
896  operator _MaskMember() const { return _M_data; }
897  };
898 
899  // }}}
900  // _SimdCastType {{{
901  // parameter type of one explicit simd constructor
902  class _SimdCastType1
903  {
904  using _Ap = __intrinsic_type_t<_Tp, _Np>;
905  _SimdMember _M_data;
906 
907  public:
908  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
909  _SimdCastType1(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
910 
911  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
912  operator _SimdMember() const { return _M_data; }
913  };
914 
915  class _SimdCastType2
916  {
917  using _Ap = __intrinsic_type_t<_Tp, _Np>;
918  using _Bp = __vector_type_t<_Tp, _Np>;
919  _SimdMember _M_data;
920 
921  public:
922  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
923  _SimdCastType2(_Ap __a) : _M_data(__vector_bitcast<_Tp>(__a)) {}
924 
925  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
926  _SimdCastType2(_Bp __b) : _M_data(__b) {}
927 
928  _GLIBCXX_SIMD_ALWAYS_INLINE constexpr
929  operator _SimdMember() const { return _M_data; }
930  };
931 
932  using _SimdCastType = conditional_t<
933  is_same<__intrinsic_type_t<_Tp, _Np>, __vector_type_t<_Tp, _Np>>::value,
934  _SimdCastType1, _SimdCastType2>;
935  //}}}
936  };
937 
938 // }}}
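// Example (illustrative sketch): an ABI tag plugs concrete types into
// _GnuTraits; what _VecBuiltin<16> requests for float roughly amounts to
//   using _T = _GnuTraits<float, int, simd_abi::_VecBuiltin<16>, 4>;
//   // _T::_SimdMember == _SimdWrapper<float, 4>,
//   // _T::_MaskMember == _SimdWrapper<int, 4>, both 16-byte aligned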
939 struct _CommonImplX86;
940 struct _CommonImplNeon;
941 struct _CommonImplBuiltin;
942 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplBuiltin;
943 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplBuiltin;
944 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplX86;
945 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplX86;
946 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplNeon;
947 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplNeon;
948 template <typename _Abi, typename = __detail::__odr_helper> struct _SimdImplPpc;
949 template <typename _Abi, typename = __detail::__odr_helper> struct _MaskImplPpc;
950 
951 // simd_abi::_VecBuiltin {{{
952 template <int _UsedBytes>
953  struct simd_abi::_VecBuiltin
954  {
955  template <typename _Tp>
956  static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
957 
958  // validity traits {{{
959  struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};
960 
961  template <typename _Tp>
962  struct _IsValidSizeFor
963  : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
964  && _UsedBytes % sizeof(_Tp) == 0
965  && _UsedBytes <= __vectorized_sizeof<_Tp>()
966  && (!__have_avx512f || _UsedBytes <= 32))> {};
967 
968  template <typename _Tp>
969  struct _IsValid : conjunction<_IsValidAbiTag, __is_vectorizable<_Tp>,
970  _IsValidSizeFor<_Tp>> {};
971 
972  template <typename _Tp>
973  static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
974 
975  // }}}
976  // _SimdImpl/_MaskImpl {{{
977 #if _GLIBCXX_SIMD_X86INTRIN
978  using _CommonImpl = _CommonImplX86;
979  using _SimdImpl = _SimdImplX86<_VecBuiltin<_UsedBytes>>;
980  using _MaskImpl = _MaskImplX86<_VecBuiltin<_UsedBytes>>;
981 #elif _GLIBCXX_SIMD_HAVE_NEON
982  using _CommonImpl = _CommonImplNeon;
983  using _SimdImpl = _SimdImplNeon<_VecBuiltin<_UsedBytes>>;
984  using _MaskImpl = _MaskImplNeon<_VecBuiltin<_UsedBytes>>;
985 #else
986  using _CommonImpl = _CommonImplBuiltin;
987 #ifdef __ALTIVEC__
988  using _SimdImpl = _SimdImplPpc<_VecBuiltin<_UsedBytes>>;
989  using _MaskImpl = _MaskImplPpc<_VecBuiltin<_UsedBytes>>;
990 #else
991  using _SimdImpl = _SimdImplBuiltin<_VecBuiltin<_UsedBytes>>;
992  using _MaskImpl = _MaskImplBuiltin<_VecBuiltin<_UsedBytes>>;
993 #endif
994 #endif
995 
996  // }}}
997  // __traits {{{
998  template <typename _Tp>
999  using _MaskValueType = __int_for_sizeof_t<_Tp>;
1000 
1001  template <typename _Tp>
1002  using __traits
1003  = conditional_t<_S_is_valid_v<_Tp>,
1004  _GnuTraits<_Tp, _MaskValueType<_Tp>,
1005  _VecBuiltin<_UsedBytes>, _S_size<_Tp>>,
1006  _InvalidTraits>;
1007 
1008  //}}}
1009  // size metadata {{{
1010  template <typename _Tp>
1011  static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
1012 
1013  template <typename _Tp>
1014  static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
1015 
1016  // }}}
1017  // implicit masks {{{
1018  template <typename _Tp>
1019  using _MaskMember = _SimdWrapper<_MaskValueType<_Tp>, _S_size<_Tp>>;
1020 
1021  template <typename _Tp>
1022  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1023  _S_implicit_mask()
1024  {
1025  using _UV = typename _MaskMember<_Tp>::_BuiltinType;
1026  if constexpr (!_MaskMember<_Tp>::_S_is_partial)
1027  return ~_UV();
1028  else
1029  {
1030  constexpr auto __size = _S_size<_Tp>;
1031  _GLIBCXX_SIMD_USE_CONSTEXPR auto __r
1032  = __generate_vector<_UV>([](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
1033  { return __i < __size ? -1 : 0; });
1034  return __r;
1035  }
1036  }
1037 
1038  template <typename _Tp>
1039  _GLIBCXX_SIMD_INTRINSIC static constexpr __intrinsic_type_t<_Tp, _S_size<_Tp>>
1040  _S_implicit_mask_intrin()
1041  { return __to_intrin(__vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()._M_data)); }
1042 
1043  template <typename _TW, typename _TVT = _VectorTraits<_TW>>
1044  _GLIBCXX_SIMD_INTRINSIC static constexpr _TW
1045  _S_masked(_TW __x)
1046  {
1047  using _Tp = typename _TVT::value_type;
1048  if constexpr (!_MaskMember<_Tp>::_S_is_partial)
1049  return __x;
1050  else
1051  return __and(__as_vector(__x),
1052  __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>()));
1053  }
1054 
1055  template <typename _TW, typename _TVT = _VectorTraits<_TW>>
1056  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1057  __make_padding_nonzero(_TW __x)
1058  {
1059  using _Tp = typename _TVT::value_type;
1060  if constexpr (!_S_is_partial<_Tp>)
1061  return __x;
1062  else
1063  {
1064  _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask
1065  = __vector_bitcast<_Tp>(_S_implicit_mask<_Tp>());
1066  if constexpr (is_integral_v<_Tp>)
1067  return __or(__x, ~__implicit_mask);
1068  else
1069  {
1070  _GLIBCXX_SIMD_USE_CONSTEXPR auto __one
1071  = __andnot(__implicit_mask,
1072  __vector_broadcast<_S_full_size<_Tp>>(_Tp(1)));
1073  // it's not enough to return `x | 1_in_padding` because the
1074  // padding in x might be inf or nan (independent of
1075  // __FINITE_MATH_ONLY__, because it's about padding bits)
1076  return __or(__and(__x, __implicit_mask), __one);
1077  }
1078  }
1079  }
1080  // }}}
1081  };
1082 
1083 // }}}
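// Example (illustrative sketch): _VecBuiltin<16> maps simd<float> onto a
// 16-byte vector of 4 floats; a partial ABI such as _VecBuiltin<12> still uses
// a 4-float vector but only 3 lanes are active:
//   // _VecBuiltin<12>::_S_size<float> == 3, _S_full_size<float> == 4
//   // _S_implicit_mask<float>() == {-1, -1, -1, 0}
//   // _S_masked(__x) therefore zeroes the fourth (padding) lane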
1084 // simd_abi::_VecBltnBtmsk {{{
1085 template <int _UsedBytes>
1086  struct simd_abi::_VecBltnBtmsk
1087  {
1088  template <typename _Tp>
1089  static constexpr size_t _S_size = _UsedBytes / sizeof(_Tp);
1090 
1091  // validity traits {{{
1092  struct _IsValidAbiTag : __bool_constant<(_UsedBytes > 1)> {};
1093 
1094  template <typename _Tp>
1095  struct _IsValidSizeFor
1096  : __bool_constant<(_UsedBytes / sizeof(_Tp) > 1
1097  && _UsedBytes % sizeof(_Tp) == 0 && _UsedBytes <= 64
1098  && (_UsedBytes > 32 || __have_avx512vl))> {};
1099 
1100 // Bitmasks require at least AVX512F. If sizeof(_Tp) < 4, AVX512BW is also
1101 // required.
1102  template <typename _Tp>
1103  struct _IsValid
1104  : conjunction<
1105  _IsValidAbiTag, __bool_constant<__have_avx512f>,
1106  __bool_constant<__have_avx512bw || (sizeof(_Tp) >= 4)>,
1107  __bool_constant<(__vectorized_sizeof<_Tp>() > sizeof(_Tp))>,
1108  _IsValidSizeFor<_Tp>> {};
1109 
1110  template <typename _Tp>
1111  static constexpr bool _S_is_valid_v = _IsValid<_Tp>::value;
1112 
1113  // }}}
1114  // simd/_MaskImpl {{{
1115  #if _GLIBCXX_SIMD_X86INTRIN
1116  using _CommonImpl = _CommonImplX86;
1117  using _SimdImpl = _SimdImplX86<_VecBltnBtmsk<_UsedBytes>>;
1118  using _MaskImpl = _MaskImplX86<_VecBltnBtmsk<_UsedBytes>>;
1119  #else
1120  template <int>
1121  struct _MissingImpl;
1122 
1123  using _CommonImpl = _MissingImpl<_UsedBytes>;
1124  using _SimdImpl = _MissingImpl<_UsedBytes>;
1125  using _MaskImpl = _MissingImpl<_UsedBytes>;
1126  #endif
1127 
1128  // }}}
1129  // __traits {{{
1130  template <typename _Tp>
1131  using _MaskMember = _SimdWrapper<bool, _S_size<_Tp>>;
1132 
1133  template <typename _Tp>
1134  using __traits = conditional_t<
1135  _S_is_valid_v<_Tp>,
1136  _GnuTraits<_Tp, bool, _VecBltnBtmsk<_UsedBytes>, _S_size<_Tp>>,
1137  _InvalidTraits>;
1138 
1139  //}}}
1140  // size metadata {{{
1141  template <typename _Tp>
1142  static constexpr size_t _S_full_size = __traits<_Tp>::_S_full_size;
1143  template <typename _Tp>
1144  static constexpr bool _S_is_partial = __traits<_Tp>::_S_is_partial;
1145 
1146  // }}}
1147  // implicit mask {{{
1148  private:
1149  template <typename _Tp>
1150  using _ImplicitMask = _SimdWrapper<bool, _S_size<_Tp>>;
1151 
1152  public:
1153  template <size_t _Np>
1154  _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_Np>
1155  __implicit_mask_n()
1156  {
1157  using _Tp = __bool_storage_member_type_t<_Np>;
1158  return _Np < sizeof(_Tp) * __CHAR_BIT__ ? _Tp((1ULL << _Np) - 1) : ~_Tp();
1159  }
1160 
1161  template <typename _Tp>
1162  _GLIBCXX_SIMD_INTRINSIC static constexpr _ImplicitMask<_Tp>
1163  _S_implicit_mask()
1164  { return __implicit_mask_n<_S_size<_Tp>>(); }
1165 
1166  template <typename _Tp>
1167  _GLIBCXX_SIMD_INTRINSIC static constexpr __bool_storage_member_type_t<_S_size<_Tp>>
1168  _S_implicit_mask_intrin()
1169  { return __implicit_mask_n<_S_size<_Tp>>(); }
1170 
1171  template <typename _Tp, size_t _Np>
1172  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1173  _S_masked(_SimdWrapper<_Tp, _Np> __x)
1174  {
1175  if constexpr (is_same_v<_Tp, bool>)
1176  if constexpr (_Np < 8 || (_Np & (_Np - 1)) != 0)
1177  return _MaskImpl::_S_bit_and(
1178  __x, _SimdWrapper<_Tp, _Np>(
1179  __bool_storage_member_type_t<_Np>((1ULL << _Np) - 1)));
1180  else
1181  return __x;
1182  else
1183  return _S_masked(__x._M_data);
1184  }
1185 
1186  template <typename _TV>
1187  _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
1188  _S_masked(_TV __x)
1189  {
1190  using _Tp = typename _VectorTraits<_TV>::value_type;
1191  static_assert(
1192  !__is_bitmask_v<_TV>,
1193  "_VecBltnBtmsk::_S_masked cannot work on bitmasks, since it doesn't "
1194  "know the number of elements. Use _SimdWrapper<bool, N> instead.");
1195  if constexpr (_S_is_partial<_Tp>)
1196  {
1197  constexpr size_t _Np = _S_size<_Tp>;
1198  return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
1199  _S_implicit_mask<_Tp>(), _SimdWrapper<_Tp, _Np>(),
1200  _SimdWrapper<_Tp, _Np>(__x));
1201  }
1202  else
1203  return __x;
1204  }
1205 
1206  template <typename _TV, typename _TVT = _VectorTraits<_TV>>
1207  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1208  __make_padding_nonzero(_TV __x)
1209  {
1210  using _Tp = typename _TVT::value_type;
1211  if constexpr (!_S_is_partial<_Tp>)
1212  return __x;
1213  else
1214  {
1215  constexpr size_t _Np = _S_size<_Tp>;
1216  if constexpr (is_integral_v<typename _TVT::value_type>)
1217  return __x
1218  | __generate_vector<_Tp, _S_full_size<_Tp>>(
1219  [](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA -> _Tp {
1220  if (__i < _Np)
1221  return 0;
1222  else
1223  return 1;
1224  });
1225  else
1226  return __make_dependent_t<_TV, _CommonImpl>::_S_blend(
1227  _S_implicit_mask<_Tp>(),
1228  _SimdWrapper<_Tp, _Np>(
1229  __vector_broadcast<_S_full_size<_Tp>>(_Tp(1))),
1230  _SimdWrapper<_Tp, _Np>(__x))
1231  ._M_data;
1232  }
1233  }
1234 
1235  // }}}
1236  };
1237 
1238 //}}}
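// Example (illustrative sketch): with AVX-512, _VecBltnBtmsk<64> keeps the data
// in builtin vectors but represents masks as bitmasks (_SimdWrapper<bool, _Np>):
//   // _VecBltnBtmsk<64>::_S_size<float> == 16,
//   // _MaskMember<float> == _SimdWrapper<bool, 16>
//   // __implicit_mask_n<3>() == 0b0111 (only the low _Np bits are set)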
1239 // _CommonImplBuiltin {{{
1240 struct _CommonImplBuiltin
1241 {
1242  // _S_converts_via_decomposition{{{
1243  // This lists all cases where a __vector_convert needs to fall back to
1244  // conversion of individual scalars (i.e. decompose the input vector into
1245  // scalars, convert, compose output vector). In those cases, _S_masked_load &
1246  // _S_masked_store prefer to use the _S_bit_iteration implementation.
1247  template <typename _From, typename _To, size_t _ToSize>
1248  static inline constexpr bool __converts_via_decomposition_v
1249  = sizeof(_From) != sizeof(_To);
1250 
1251  // }}}
1252  // _S_load{{{
1253  template <typename _Tp, size_t _Np, size_t _Bytes = _Np * sizeof(_Tp)>
1254  _GLIBCXX_SIMD_INTRINSIC static __vector_type_t<_Tp, _Np>
1255  _S_load(const void* __p)
1256  {
1257  static_assert(_Np > 1);
1258  static_assert(_Bytes % sizeof(_Tp) == 0);
1259  using _Rp = __vector_type_t<_Tp, _Np>;
1260  if constexpr (sizeof(_Rp) == _Bytes)
1261  {
1262  _Rp __r;
1263  __builtin_memcpy(&__r, __p, _Bytes);
1264  return __r;
1265  }
1266  else
1267  {
1268 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424
1269  using _Up = conditional_t<
1270  is_integral_v<_Tp>,
1271  conditional_t<_Bytes % 4 == 0,
1272  conditional_t<_Bytes % 8 == 0, long long, int>,
1273  conditional_t<_Bytes % 2 == 0, short, signed char>>,
1274  conditional_t<(_Bytes < 8 || _Np % 2 == 1 || _Np == 2), _Tp,
1275  double>>;
1276  using _V = __vector_type_t<_Up, _Np * sizeof(_Tp) / sizeof(_Up)>;
1277  if constexpr (sizeof(_V) != sizeof(_Rp))
1278  { // on i386 with 4 < _Bytes <= 8
1279  _Rp __r{};
1280  __builtin_memcpy(&__r, __p, _Bytes);
1281  return __r;
1282  }
1283  else
1284 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424
1285  using _V = _Rp;
1286 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
1287  {
1288  _V __r{};
1289  static_assert(_Bytes <= sizeof(_V));
1290  __builtin_memcpy(&__r, __p, _Bytes);
1291  return reinterpret_cast<_Rp>(__r);
1292  }
1293  }
1294  }
1295 
1296  // }}}
1297  // _S_store {{{
1298  template <size_t _ReqBytes = 0, typename _TV>
1299  _GLIBCXX_SIMD_INTRINSIC static void
1300  _S_store(_TV __x, void* __addr)
1301  {
1302  constexpr size_t _Bytes = _ReqBytes == 0 ? sizeof(__x) : _ReqBytes;
1303  static_assert(sizeof(__x) >= _Bytes);
1304 
1305  if constexpr (__is_vector_type_v<_TV>)
1306  {
1307  using _Tp = typename _VectorTraits<_TV>::value_type;
1308  constexpr size_t _Np = _Bytes / sizeof(_Tp);
1309  static_assert(_Np * sizeof(_Tp) == _Bytes);
1310 
1311 #ifdef _GLIBCXX_SIMD_WORKAROUND_PR90424
1312  using _Up = conditional_t<
1313  (is_integral_v<_Tp> || _Bytes < 4),
1314  conditional_t<(sizeof(__x) > sizeof(long long)), long long, _Tp>,
1315  float>;
1316  const auto __v = __vector_bitcast<_Up>(__x);
1317 #else // _GLIBCXX_SIMD_WORKAROUND_PR90424
1318  const __vector_type_t<_Tp, _Np> __v = __x;
1319 #endif // _GLIBCXX_SIMD_WORKAROUND_PR90424
1320 
1321  if constexpr ((_Bytes & (_Bytes - 1)) != 0)
1322  {
1323  constexpr size_t _MoreBytes = std::__bit_ceil(_Bytes);
1324  alignas(decltype(__v)) char __tmp[_MoreBytes];
1325  __builtin_memcpy(__tmp, &__v, _MoreBytes);
1326  __builtin_memcpy(__addr, __tmp, _Bytes);
1327  }
1328  else
1329  __builtin_memcpy(__addr, &__v, _Bytes);
1330  }
1331  else
1332  __builtin_memcpy(__addr, &__x, _Bytes);
1333  }
1334 
1335  template <typename _Tp, size_t _Np>
1336  _GLIBCXX_SIMD_INTRINSIC static void
1337  _S_store(_SimdWrapper<_Tp, _Np> __x, void* __addr)
1338  { _S_store<_Np * sizeof(_Tp)>(__x._M_data, __addr); }
1339 
1340  // }}}
1341  // _S_store_bool_array(_BitMask) {{{
1342  template <size_t _Np, bool _Sanitized>
1343  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1344  _S_store_bool_array(_BitMask<_Np, _Sanitized> __x, bool* __mem)
1345  {
1346  if constexpr (_Np == 1)
1347  __mem[0] = __x[0];
1348  else if (__builtin_is_constant_evaluated())
1349  {
1350  for (size_t __i = 0; __i < _Np; ++__i)
1351  __mem[__i] = __x[__i];
1352  }
1353  else if constexpr (_Np == 2)
1354  {
1355  short __bool2 = (__x._M_to_bits() * 0x81) & 0x0101;
1356  _S_store<_Np>(__bool2, __mem);
1357  }
1358  else if constexpr (_Np == 3)
1359  {
1360  int __bool3 = (__x._M_to_bits() * 0x4081) & 0x010101;
1361  _S_store<_Np>(__bool3, __mem);
1362  }
1363  else
1364  {
1365  __execute_n_times<__div_roundup(_Np, 4)>(
1366  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1367  constexpr int __offset = __i * 4;
1368  constexpr int __remaining = _Np - __offset;
1369  if constexpr (__remaining > 4 && __remaining <= 7)
1370  {
1371  const _ULLong __bool7
1372  = (__x.template _M_extract<__offset>()._M_to_bits()
1373  * 0x40810204081ULL)
1374  & 0x0101010101010101ULL;
1375  _S_store<__remaining>(__bool7, __mem + __offset);
1376  }
1377  else if constexpr (__remaining >= 4)
1378  {
1379  int __bits = __x.template _M_extract<__offset>()._M_to_bits();
1380  if constexpr (__remaining > 7)
1381  __bits &= 0xf;
1382  const int __bool4 = (__bits * 0x204081) & 0x01010101;
1383  _S_store<4>(__bool4, __mem + __offset);
1384  }
1385  });
1386  }
1387  }
1388 
1389  // }}}
1390  // _S_blend{{{
1391  template <typename _Tp, size_t _Np>
1392  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
1393  _S_blend(_SimdWrapper<__int_for_sizeof_t<_Tp>, _Np> __k,
1394  _SimdWrapper<_Tp, _Np> __at0, _SimdWrapper<_Tp, _Np> __at1)
1395  { return __k._M_data ? __at1._M_data : __at0._M_data; }
1396 
1397  // }}}
1398 };
1399 
1400 // }}}
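// Example (illustrative sketch): _S_store_bool_array spreads mask bits into
// bytes with a multiply-and-mask, e.g. for 4 bits:
//   // __bits == 0b0101
//   // (__bits * 0x204081) & 0x01010101 == 0x00010001
//   // i.e. the bytes {1, 0, 1, 0} that _S_store writes to the bool array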
1401 // _SimdImplBuiltin {{{1
1402 template <typename _Abi, typename>
1403  struct _SimdImplBuiltin
1404  {
1405  // member types {{{2
1406  template <typename _Tp>
1407  static constexpr size_t _S_max_store_size = 16;
1408 
1409  using abi_type = _Abi;
1410 
1411  template <typename _Tp>
1412  using _TypeTag = _Tp*;
1413 
1414  template <typename _Tp>
1415  using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
1416 
1417  template <typename _Tp>
1418  using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
1419 
1420  template <typename _Tp>
1421  static constexpr size_t _S_size = _Abi::template _S_size<_Tp>;
1422 
1423  template <typename _Tp>
1424  static constexpr size_t _S_full_size = _Abi::template _S_full_size<_Tp>;
1425 
1426  using _CommonImpl = typename _Abi::_CommonImpl;
1427  using _SuperImpl = typename _Abi::_SimdImpl;
1428  using _MaskImpl = typename _Abi::_MaskImpl;
1429 
1430  // _M_make_simd(_SimdWrapper/__intrinsic_type_t) {{{2
1431  template <typename _Tp, size_t _Np>
1432  _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
1433  _M_make_simd(_SimdWrapper<_Tp, _Np> __x)
1434  { return {__private_init, __x}; }
1435 
1436  template <typename _Tp, size_t _Np>
1437  _GLIBCXX_SIMD_INTRINSIC static constexpr simd<_Tp, _Abi>
1438  _M_make_simd(__intrinsic_type_t<_Tp, _Np> __x)
1439  { return {__private_init, __vector_bitcast<_Tp>(__x)}; }
1440 
1441  // _S_broadcast {{{2
1442  template <typename _Tp>
1443  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1444  _S_broadcast(_Tp __x) noexcept
1445  { return __vector_broadcast<_S_full_size<_Tp>>(__x); }
1446 
1447  // _S_generator {{{2
1448  template <typename _Fp, typename _Tp>
1449  inline static constexpr _SimdMember<_Tp>
1450  _S_generator(_Fp&& __gen, _TypeTag<_Tp>)
1451  {
1452  return __generate_vector<_Tp, _S_full_size<_Tp>>(
1453  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1454  if constexpr (__i < _S_size<_Tp>)
1455  return __gen(__i);
1456  else
1457  return 0;
1458  });
1459  }
1460 
1461  // _S_load {{{2
1462  template <typename _Tp, typename _Up>
1463  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdMember<_Tp>
1464  _S_load(const _Up* __mem, _TypeTag<_Tp>) noexcept
1465  {
1466  constexpr size_t _Np = _S_size<_Tp>;
1467  constexpr size_t __max_load_size
1468  = (sizeof(_Up) >= 4 && __have_avx512f) || __have_avx512bw ? 64
1469  : (is_floating_point_v<_Up> && __have_avx) || __have_avx2 ? 32
1470  : 16;
1471  constexpr size_t __bytes_to_load = sizeof(_Up) * _Np;
1472  if (__builtin_is_constant_evaluated())
1473  return __generate_vector<_Tp, _S_full_size<_Tp>>(
1474  [&](auto __i) constexpr {
1475  return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
1476  });
1477  else if constexpr (sizeof(_Up) > 8)
1478  return __generate_vector<_Tp, _SimdMember<_Tp>::_S_full_size>(
1479  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1480  return static_cast<_Tp>(__i < _Np ? __mem[__i] : 0);
1481  });
1482  else if constexpr (is_same_v<_Up, _Tp>)
1483  return _CommonImpl::template _S_load<_Tp, _S_full_size<_Tp>,
1484  _Np * sizeof(_Tp)>(__mem);
1485  else if constexpr (__bytes_to_load <= __max_load_size)
1486  return __convert<_SimdMember<_Tp>>(
1487  _CommonImpl::template _S_load<_Up, _Np>(__mem));
1488  else if constexpr (__bytes_to_load % __max_load_size == 0)
1489  {
1490  constexpr size_t __n_loads = __bytes_to_load / __max_load_size;
1491  constexpr size_t __elements_per_load = _Np / __n_loads;
1492  return __call_with_n_evaluations<__n_loads>(
1493  [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1494  return __convert<_SimdMember<_Tp>>(__uncvted...);
1495  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1496  return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1497  __mem + __i * __elements_per_load);
1498  });
1499  }
1500  else if constexpr (__bytes_to_load % (__max_load_size / 2) == 0
1501  && __max_load_size > 16)
1502  { // e.g. int[] -> <char, 12> with AVX2
1503  constexpr size_t __n_loads
1504  = __bytes_to_load / (__max_load_size / 2);
1505  constexpr size_t __elements_per_load = _Np / __n_loads;
1506  return __call_with_n_evaluations<__n_loads>(
1507  [](auto... __uncvted) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1508  return __convert<_SimdMember<_Tp>>(__uncvted...);
1509  }, [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1510  return _CommonImpl::template _S_load<_Up, __elements_per_load>(
1511  __mem + __i * __elements_per_load);
1512  });
1513  }
1514  else // e.g. int[] -> <char, 9>
1515  return __call_with_subscripts(
1516  __mem, make_index_sequence<_Np>(),
1517  [](auto... __args) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1518  return __vector_type_t<_Tp, _S_full_size<_Tp>>{static_cast<_Tp>(__args)...};
1519  });
1520  }
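  // Illustrative example: with AVX2, _Tp = char, _Up = int and _Np = 16,
  // __bytes_to_load == 64 and __max_load_size == 32, so the
  // "% __max_load_size == 0" branch issues two 32-byte loads of 8 ints each
  // and converts both results in a single __convert call. Sizes that divide
  // neither __max_load_size nor its half fall back to the element-wise
  // initialization at the end.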
1521 
1522  // _S_masked_load {{{2
1523  template <typename _Tp, size_t _Np, typename _Up>
1524  static constexpr inline _SimdWrapper<_Tp, _Np>
1525  _S_masked_load(_SimdWrapper<_Tp, _Np> __merge, _MaskMember<_Tp> __k,
1526  const _Up* __mem) noexcept
1527  {
1528  _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1529  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1530  __merge._M_set(__i, static_cast<_Tp>(__mem[__i]));
1531  });
1532  return __merge;
1533  }
1534 
1535  // _S_store {{{2
1536  template <typename _Tp, typename _Up>
1537  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1538  _S_store(_SimdMember<_Tp> __v, _Up* __mem, _TypeTag<_Tp>) noexcept
1539  {
1540  // TODO: converting int -> "smaller int" can be optimized with AVX512
1541  constexpr size_t _Np = _S_size<_Tp>;
1542  constexpr size_t __max_store_size
1543  = _SuperImpl::template _S_max_store_size<_Up>;
1544  if (__builtin_is_constant_evaluated())
1545  {
1546  for (size_t __i = 0; __i < _Np; ++__i)
1547  __mem[__i] = __v[__i];
1548  }
1549  else if constexpr (sizeof(_Up) > 8)
1550  __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1551  __mem[__i] = __v[__i];
1552  });
1553  else if constexpr (is_same_v<_Up, _Tp>)
1554  _CommonImpl::_S_store(__v, __mem);
1555  else if constexpr (sizeof(_Up) * _Np <= __max_store_size)
1556  _CommonImpl::_S_store(_SimdWrapper<_Up, _Np>(__convert<_Up>(__v)),
1557  __mem);
1558  else
1559  {
1560  constexpr size_t __vsize = __max_store_size / sizeof(_Up);
1561  // round up to convert the last partial vector as well:
1562  constexpr size_t __stores = __div_roundup(_Np, __vsize);
1563  constexpr size_t __full_stores = _Np / __vsize;
1564  using _V = __vector_type_t<_Up, __vsize>;
1565  const array<_V, __stores> __converted
1566  = __convert_all<_V, __stores>(__v);
1567  __execute_n_times<__full_stores>(
1568  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1569  _CommonImpl::_S_store(__converted[__i], __mem + __i * __vsize);
1570  });
1571  if constexpr (__full_stores < __stores)
1572  _CommonImpl::template _S_store<(_Np - __full_stores * __vsize)
1573  * sizeof(_Up)>(
1574  __converted[__full_stores], __mem + __full_stores * __vsize);
1575  }
1576  }
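  // Note on the converting-store path above: the last of the __stores chunks
  // may be partial; it is written through _S_store with an explicit byte
  // count, so nothing past __mem[_Np - 1] is written.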
1577 
1578  // _S_masked_store_nocvt {{{2
1579  template <typename _Tp, size_t _Np>
1580  _GLIBCXX_SIMD_INTRINSIC static constexpr void
1581  _S_masked_store_nocvt(_SimdWrapper<_Tp, _Np> __v, _Tp* __mem, _MaskMember<_Tp> __k)
1582  {
1583  _BitOps::_S_bit_iteration(
1584  _MaskImpl::_S_to_bits(__k),
1585  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1586  __mem[__i] = __v[__i];
1587  });
1588  }
1589 
1590  // _S_masked_store {{{2
1591  template <typename _TW, typename _TVT = _VectorTraits<_TW>,
1592  typename _Tp = typename _TVT::value_type, typename _Up>
1593  static constexpr inline void
1594  _S_masked_store(const _TW __v, _Up* __mem, const _MaskMember<_Tp> __k) noexcept
1595  {
1596  constexpr size_t _TV_size = _S_size<_Tp>;
1597  [[maybe_unused]] const auto __vi = __to_intrin(__v);
1598  constexpr size_t __max_store_size
1599  = _SuperImpl::template _S_max_store_size<_Up>;
1600  if constexpr (
1601  is_same_v<
1602  _Tp,
1603  _Up> || (is_integral_v<_Tp> && is_integral_v<_Up> && sizeof(_Tp) == sizeof(_Up)))
1604  {
 1605  // only a bitwise reinterpretation or no conversion at all is needed:
1606  const _MaskMember<_Up> __kk = [&]() _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1607  if constexpr (__is_bitmask_v<decltype(__k)>)
1608  return _MaskMember<_Up>(__k._M_data);
1609  else
1610  return __wrapper_bitcast<__int_for_sizeof_t<_Up>>(__k);
1611  }();
1612  _SuperImpl::_S_masked_store_nocvt(__wrapper_bitcast<_Up>(__v),
1613  __mem, __kk);
1614  }
1615  else if constexpr (__vectorized_sizeof<_Up>() > sizeof(_Up)
1616  && !_CommonImpl::
1617  template __converts_via_decomposition_v<
1618  _Tp, _Up, __max_store_size>)
 1619  { // a conversion via decomposition is better handled by the
 1620  // bit_iteration fallback below; the condition above excludes
 1621  // that case (__converts_via_decomposition_v)
1622  constexpr size_t _UW_size
1623  = std::min(_TV_size, __max_store_size / sizeof(_Up));
1624  static_assert(_UW_size <= _TV_size);
1625  using _UW = _SimdWrapper<_Up, _UW_size>;
1626  using _UV = __vector_type_t<_Up, _UW_size>;
1627  using _UAbi = simd_abi::deduce_t<_Up, _UW_size>;
1628  if constexpr (_UW_size == _TV_size) // one convert+store
1629  {
1630  const _UW __converted = __convert<_UW>(__v);
1631  _SuperImpl::_S_masked_store_nocvt(
1632  __converted, __mem,
1633  _UAbi::_MaskImpl::template _S_convert<
1634  __int_for_sizeof_t<_Up>>(__k));
1635  }
1636  else
1637  {
1638  static_assert(_UW_size * sizeof(_Up) == __max_store_size);
1639  constexpr size_t _NFullStores = _TV_size / _UW_size;
1640  constexpr size_t _NAllStores
1641  = __div_roundup(_TV_size, _UW_size);
1642  constexpr size_t _NParts = _S_full_size<_Tp> / _UW_size;
1643  const array<_UV, _NAllStores> __converted
1644  = __convert_all<_UV, _NAllStores>(__v);
1645  __execute_n_times<_NFullStores>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1646  _SuperImpl::_S_masked_store_nocvt(
1647  _UW(__converted[__i]), __mem + __i * _UW_size,
1648  _UAbi::_MaskImpl::template _S_convert<
1649  __int_for_sizeof_t<_Up>>(
1650  __extract_part<__i, _NParts>(__k.__as_full_vector())));
1651  });
1652  if constexpr (_NAllStores
1653  > _NFullStores) // one partial at the end
1654  _SuperImpl::_S_masked_store_nocvt(
1655  _UW(__converted[_NFullStores]),
1656  __mem + _NFullStores * _UW_size,
1657  _UAbi::_MaskImpl::template _S_convert<
1658  __int_for_sizeof_t<_Up>>(
1659  __extract_part<_NFullStores, _NParts>(
1660  __k.__as_full_vector())));
1661  }
1662  }
1663  else
1664  _BitOps::_S_bit_iteration(_MaskImpl::_S_to_bits(__k),
1665  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
1666  __mem[__i] = static_cast<_Up>(__v[__i]);
1667  });
1668  }
1669 
1670  // _S_complement {{{2
1671  template <typename _Tp, size_t _Np>
1672  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1673  _S_complement(_SimdWrapper<_Tp, _Np> __x) noexcept
1674  {
1675  if constexpr (is_floating_point_v<_Tp>)
1676  return __vector_bitcast<_Tp>(~__vector_bitcast<__int_for_sizeof_t<_Tp>>(__x));
1677  else
1678  return ~__x._M_data;
1679  }
1680 
1681  // _S_unary_minus {{{2
1682  template <typename _Tp, size_t _Np>
1683  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1684  _S_unary_minus(_SimdWrapper<_Tp, _Np> __x) noexcept
1685  {
1686  // GCC doesn't use the psign instructions, but pxor & psub seem to be
1687  // just as good a choice as pcmpeqd & psign. So meh.
1688  return -__x._M_data;
1689  }
1690 
1691  // arithmetic operators {{{2
1692  template <typename _Tp, size_t _Np>
1693  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1694  _S_plus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1695  { return __x._M_data + __y._M_data; }
1696 
1697  template <typename _Tp, size_t _Np>
1698  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1699  _S_minus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1700  { return __x._M_data - __y._M_data; }
1701 
1702  template <typename _Tp, size_t _Np>
1703  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1704  _S_multiplies(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1705  { return __x._M_data * __y._M_data; }
1706 
1707  template <typename _Tp, size_t _Np>
1708  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1709  _S_divides(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1710  {
 1711  // Note that division by 0 is always UB, so we must ensure that the
 1712  // padding elements of partial registers never divide by 0
1713  if constexpr (!_Abi::template _S_is_partial<_Tp>)
1714  return __x._M_data / __y._M_data;
1715  else
1716  return __x._M_data / _Abi::__make_padding_nonzero(__y._M_data);
1717  }
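  // Illustrative example: for a partial register, e.g. 3 ints held in a
  // 4-lane vector, the 4th lane of __y is unspecified and could be 0.
  // __make_padding_nonzero replaces the padding lanes with a non-zero value
  // before the hardware division, so an unused lane cannot divide by zero.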
1718 
1719  template <typename _Tp, size_t _Np>
1720  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1721  _S_modulus(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1722  {
1723  if constexpr (!_Abi::template _S_is_partial<_Tp>)
1724  return __x._M_data % __y._M_data;
1725  else
1726  return __as_vector(__x)
1727  % _Abi::__make_padding_nonzero(__as_vector(__y));
1728  }
1729 
1730  template <typename _Tp, size_t _Np>
1731  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1732  _S_bit_and(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1733  { return __and(__x, __y); }
1734 
1735  template <typename _Tp, size_t _Np>
1736  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1737  _S_bit_or(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1738  { return __or(__x, __y); }
1739 
1740  template <typename _Tp, size_t _Np>
1741  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1742  _S_bit_xor(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1743  { return __xor(__x, __y); }
1744 
1745  template <typename _Tp, size_t _Np>
1746  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1747  _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1748  { return __x._M_data << __y._M_data; }
1749 
1750  template <typename _Tp, size_t _Np>
1751  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
1752  _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1753  { return __x._M_data >> __y._M_data; }
1754 
1755  template <typename _Tp, size_t _Np>
1756  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1757  _S_bit_shift_left(_SimdWrapper<_Tp, _Np> __x, int __y)
1758  { return __x._M_data << __y; }
1759 
1760  template <typename _Tp, size_t _Np>
1761  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
1762  _S_bit_shift_right(_SimdWrapper<_Tp, _Np> __x, int __y)
1763  { return __x._M_data >> __y; }
1764 
1765  // compares {{{2
1766  // _S_equal_to {{{3
1767  template <typename _Tp, size_t _Np>
1768  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1769  _S_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1770  { return __x._M_data == __y._M_data; }
1771 
1772  // _S_not_equal_to {{{3
1773  template <typename _Tp, size_t _Np>
1774  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1775  _S_not_equal_to(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1776  { return __x._M_data != __y._M_data; }
1777 
1778  // _S_less {{{3
1779  template <typename _Tp, size_t _Np>
1780  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1781  _S_less(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1782  { return __x._M_data < __y._M_data; }
1783 
1784  // _S_less_equal {{{3
1785  template <typename _Tp, size_t _Np>
1786  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1787  _S_less_equal(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
1788  { return __x._M_data <= __y._M_data; }
1789 
1790  // _S_negate {{{2
1791  template <typename _Tp, size_t _Np>
1792  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
1793  _S_negate(_SimdWrapper<_Tp, _Np> __x) noexcept
1794  { return !__x._M_data; }
1795 
1796  // _S_min, _S_max, _S_minmax {{{2
1797  template <typename _Tp, size_t _Np>
1798  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1799  _SimdWrapper<_Tp, _Np>
1800  _S_min(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1801  { return __a._M_data < __b._M_data ? __a._M_data : __b._M_data; }
1802 
1803  template <typename _Tp, size_t _Np>
1804  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1805  _SimdWrapper<_Tp, _Np>
1806  _S_max(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1807  { return __a._M_data > __b._M_data ? __a._M_data : __b._M_data; }
1808 
1809  template <typename _Tp, size_t _Np>
1810  _GLIBCXX_SIMD_NORMAL_MATH _GLIBCXX_SIMD_INTRINSIC static constexpr
1811  pair<_SimdWrapper<_Tp, _Np>, _SimdWrapper<_Tp, _Np>>
1812  _S_minmax(_SimdWrapper<_Tp, _Np> __a, _SimdWrapper<_Tp, _Np> __b)
1813  {
1814  return {__a._M_data < __b._M_data ? __a._M_data : __b._M_data,
1815  __a._M_data < __b._M_data ? __b._M_data : __a._M_data};
1816  }
1817 
1818  // reductions {{{2
1819  template <size_t _Np, size_t... _Is, size_t... _Zeros, typename _Tp,
1820  typename _BinaryOperation>
1821  _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1822  _S_reduce_partial(index_sequence<_Is...>, index_sequence<_Zeros...>,
1823  simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1824  {
1825  using _V = __vector_type_t<_Tp, _Np / 2>;
1826  static_assert(sizeof(_V) <= sizeof(__x));
1827  // _S_full_size is the size of the smallest native SIMD register that
1828  // can store _Np/2 elements:
1829  using _FullSimd = __deduced_simd<_Tp, _VectorTraits<_V>::_S_full_size>;
1830  using _HalfSimd = __deduced_simd<_Tp, _Np / 2>;
1831  const auto __xx = __as_vector(__x);
1832  return _HalfSimd::abi_type::_SimdImpl::_S_reduce(
1833  static_cast<_HalfSimd>(__as_vector(__binary_op(
1834  static_cast<_FullSimd>(__intrin_bitcast<_V>(__xx)),
1835  static_cast<_FullSimd>(__intrin_bitcast<_V>(
1836  __vector_permute<(_Np / 2 + _Is)..., (int(_Zeros * 0) - 1)...>(
1837  __xx)))))),
1838  __binary_op);
1839  }
1840 
1841  template <typename _Tp, typename _BinaryOperation>
1842  _GLIBCXX_SIMD_INTRINSIC static constexpr _Tp
1843  _S_reduce(simd<_Tp, _Abi> __x, _BinaryOperation&& __binary_op)
1844  {
1845  constexpr size_t _Np = simd_size_v<_Tp, _Abi>;
1846  if constexpr (_Np == 1)
1847  return __x[0];
1848  else if constexpr (_Np == 2)
1849  return __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]),
1850  simd<_Tp, simd_abi::scalar>(__x[1]))[0];
1851  else if (__builtin_is_constant_evaluated())
1852  {
1853  simd<_Tp, simd_abi::scalar> __acc = __x[0];
1854  for (size_t __i = 1; __i < _Np; ++__i)
1855  __acc = __binary_op(__acc, simd<_Tp, simd_abi::scalar>(__x[__i]));
1856  return __acc[0];
1857  }
1858  else if constexpr (_Abi::template _S_is_partial<_Tp>) //{{{
1859  {
1860  [[maybe_unused]] constexpr auto __full_size
1861  = _Abi::template _S_full_size<_Tp>;
1862  if constexpr (_Np == 3)
1863  return __binary_op(
1864  __binary_op(simd<_Tp, simd_abi::scalar>(__x[0]),
1865  simd<_Tp, simd_abi::scalar>(__x[1])),
1866  simd<_Tp, simd_abi::scalar>(__x[2]))[0];
1867  else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1868  plus<>>)
1869  {
1870  using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
1871  return _Ap::_SimdImpl::_S_reduce(
1872  simd<_Tp, _Ap>(__private_init,
1873  _Abi::_S_masked(__as_vector(__x))),
1874  __binary_op);
1875  }
1876  else if constexpr (is_same_v<__remove_cvref_t<_BinaryOperation>,
1877  multiplies<>>)
1878  {
1879  using _Ap = simd_abi::deduce_t<_Tp, __full_size>;
1880  using _TW = _SimdWrapper<_Tp, __full_size>;
1881  _GLIBCXX_SIMD_USE_CONSTEXPR auto __implicit_mask_full
1882  = _Abi::template _S_implicit_mask<_Tp>().__as_full_vector();
1883  _GLIBCXX_SIMD_USE_CONSTEXPR _TW __one
1884  = __vector_broadcast<__full_size>(_Tp(1));
1885  const _TW __x_full = __data(__x).__as_full_vector();
1886  const _TW __x_padded_with_ones
1887  = _Ap::_CommonImpl::_S_blend(__implicit_mask_full, __one,
1888  __x_full);
1889  return _Ap::_SimdImpl::_S_reduce(
1890  simd<_Tp, _Ap>(__private_init, __x_padded_with_ones),
1891  __binary_op);
1892  }
1893  else if constexpr (_Np & 1)
1894  {
1895  using _Ap = simd_abi::deduce_t<_Tp, _Np - 1>;
1896  return __binary_op(
1897  simd<_Tp, simd_abi::scalar>(_Ap::_SimdImpl::_S_reduce(
1898  simd<_Tp, _Ap>(
1899  __intrin_bitcast<__vector_type_t<_Tp, _Np - 1>>(
1900  __as_vector(__x))),
1901  __binary_op)),
1902  simd<_Tp, simd_abi::scalar>(__x[_Np - 1]))[0];
1903  }
1904  else
1905  return _S_reduce_partial<_Np>(
1906  make_index_sequence<_Np / 2>(),
1907  make_index_sequence<__full_size - _Np / 2>(), __x, __binary_op);
1908  } //}}}
1909  else if constexpr (sizeof(__x) == 16) //{{{
1910  {
1911  if constexpr (_Np == 16)
1912  {
1913  const auto __y = __data(__x);
1914  __x = __binary_op(
1915  _M_make_simd<_Tp, _Np>(
1916  __vector_permute<0, 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6,
1917  7, 7>(__y)),
1918  _M_make_simd<_Tp, _Np>(
1919  __vector_permute<8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
1920  14, 14, 15, 15>(__y)));
1921  }
1922  if constexpr (_Np >= 8)
1923  {
1924  const auto __y = __vector_bitcast<short>(__data(__x));
1925  __x = __binary_op(
1926  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1927  __vector_permute<0, 0, 1, 1, 2, 2, 3, 3>(__y))),
1928  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1929  __vector_permute<4, 4, 5, 5, 6, 6, 7, 7>(__y))));
1930  }
1931  if constexpr (_Np >= 4)
1932  {
1933  using _Up = conditional_t<is_floating_point_v<_Tp>, float, int>;
1934  const auto __y = __vector_bitcast<_Up>(__data(__x));
1935  __x = __binary_op(__x,
1936  _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1937  __vector_permute<3, 2, 1, 0>(__y))));
1938  }
1939  using _Up = conditional_t<is_floating_point_v<_Tp>, double, _LLong>;
1940  const auto __y = __vector_bitcast<_Up>(__data(__x));
1941  __x = __binary_op(__x, _M_make_simd<_Tp, _Np>(__vector_bitcast<_Tp>(
1942  __vector_permute<1, 1>(__y))));
1943  return __x[0];
1944  } //}}}
1945  else
1946  {
1947  static_assert(sizeof(__x) > __min_vector_size<_Tp>);
1948  static_assert((_Np & (_Np - 1)) == 0); // _Np must be a power of 2
1949  using _Ap = simd_abi::deduce_t<_Tp, _Np / 2>;
1950  using _V = simd<_Tp, _Ap>;
1951  return _Ap::_SimdImpl::_S_reduce(
1952  __binary_op(_V(__private_init, __extract<0, 2>(__as_vector(__x))),
1953  _V(__private_init,
1954  __extract<1, 2>(__as_vector(__x)))),
1955  static_cast<_BinaryOperation&&>(__binary_op));
1956  }
1957  }
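  // Illustrative example for the 16-byte branch (assuming __binary_op is
  // associative and commutative, as reduce() requires): for 4 floats
  // [a, b, c, d] with plus<>,
  //   step 1: [a, b, c, d] + [d, c, b, a]  (permute<3, 2, 1, 0>)
  //   step 2: add the upper 64-bit half, duplicated via permute<1, 1>
  // which leaves a + b + c + d in lane 0, the value returned.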
1958 
1959  // math {{{2
1960  // frexp, modf and copysign implemented in simd_math.h
1961 #define _GLIBCXX_SIMD_MATH_FALLBACK(__name) \
1962  template <typename _Tp, typename... _More> \
1963  static _Tp \
1964  _S_##__name(const _Tp& __x, const _More&... __more) \
1965  { \
1966  return __generate_vector<_Tp>( \
1967  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1968  return __name(__x[__i], __more[__i]...); \
1969  }); \
1970  }
1971 
1972 #define _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET(__name) \
1973  template <typename _Tp, typename... _More> \
1974  static typename _Tp::mask_type \
1975  _S_##__name(const _Tp& __x, const _More&... __more) \
1976  { \
1977  return __generate_vector<_Tp>( \
1978  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1979  return __name(__x[__i], __more[__i]...); \
1980  }); \
1981  }
1982 
1983 #define _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(_RetTp, __name) \
1984  template <typename _Tp, typename... _More> \
1985  static auto \
1986  _S_##__name(const _Tp& __x, const _More&... __more) \
1987  { \
1988  return __fixed_size_storage_t<_RetTp, \
1989  _VectorTraits<_Tp>::_S_partial_width>:: \
1990  _S_generate([&](auto __meta) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1991  return __meta._S_generator( \
1992  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA { \
1993  return __name(__x[__meta._S_offset + __i], \
1994  __more[__meta._S_offset + __i]...); \
1995  }, \
1996  static_cast<_RetTp*>(nullptr)); \
1997  }); \
1998  }
1999 
2000  _GLIBCXX_SIMD_MATH_FALLBACK(acos)
2001  _GLIBCXX_SIMD_MATH_FALLBACK(asin)
2002  _GLIBCXX_SIMD_MATH_FALLBACK(atan)
2003  _GLIBCXX_SIMD_MATH_FALLBACK(atan2)
2004  _GLIBCXX_SIMD_MATH_FALLBACK(cos)
2005  _GLIBCXX_SIMD_MATH_FALLBACK(sin)
2006  _GLIBCXX_SIMD_MATH_FALLBACK(tan)
2007  _GLIBCXX_SIMD_MATH_FALLBACK(acosh)
2008  _GLIBCXX_SIMD_MATH_FALLBACK(asinh)
2009  _GLIBCXX_SIMD_MATH_FALLBACK(atanh)
2010  _GLIBCXX_SIMD_MATH_FALLBACK(cosh)
2011  _GLIBCXX_SIMD_MATH_FALLBACK(sinh)
2012  _GLIBCXX_SIMD_MATH_FALLBACK(tanh)
2013  _GLIBCXX_SIMD_MATH_FALLBACK(exp)
2014  _GLIBCXX_SIMD_MATH_FALLBACK(exp2)
2015  _GLIBCXX_SIMD_MATH_FALLBACK(expm1)
2016  _GLIBCXX_SIMD_MATH_FALLBACK(ldexp)
2017  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(int, ilogb)
2018  _GLIBCXX_SIMD_MATH_FALLBACK(log)
2019  _GLIBCXX_SIMD_MATH_FALLBACK(log10)
2020  _GLIBCXX_SIMD_MATH_FALLBACK(log1p)
2021  _GLIBCXX_SIMD_MATH_FALLBACK(log2)
2022  _GLIBCXX_SIMD_MATH_FALLBACK(logb)
2023 
2024  // modf implemented in simd_math.h
2025  _GLIBCXX_SIMD_MATH_FALLBACK(scalbn)
2026  _GLIBCXX_SIMD_MATH_FALLBACK(scalbln)
2027  _GLIBCXX_SIMD_MATH_FALLBACK(cbrt)
2028  _GLIBCXX_SIMD_MATH_FALLBACK(fabs)
2029  _GLIBCXX_SIMD_MATH_FALLBACK(pow)
2030  _GLIBCXX_SIMD_MATH_FALLBACK(sqrt)
2031  _GLIBCXX_SIMD_MATH_FALLBACK(erf)
2032  _GLIBCXX_SIMD_MATH_FALLBACK(erfc)
2033  _GLIBCXX_SIMD_MATH_FALLBACK(lgamma)
2034  _GLIBCXX_SIMD_MATH_FALLBACK(tgamma)
2035 
2036  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lrint)
2037  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llrint)
2038 
2039  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long, lround)
2040  _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET(long long, llround)
2041 
2042  _GLIBCXX_SIMD_MATH_FALLBACK(fmod)
2043  _GLIBCXX_SIMD_MATH_FALLBACK(remainder)
2044 
2045  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2046  static _Tp
2047  _S_remquo(const _Tp __x, const _Tp __y,
2048  __fixed_size_storage_t<int, _TVT::_S_partial_width>* __z)
2049  {
2050  return __generate_vector<_Tp>([&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2051  int __tmp;
2052  auto __r = remquo(__x[__i], __y[__i], &__tmp);
2053  __z->_M_set(__i, __tmp);
2054  return __r;
2055  });
2056  }
2057 
2058  // copysign in simd_math.h
2059  _GLIBCXX_SIMD_MATH_FALLBACK(nextafter)
2060  _GLIBCXX_SIMD_MATH_FALLBACK(fdim)
2061  _GLIBCXX_SIMD_MATH_FALLBACK(fmax)
2062  _GLIBCXX_SIMD_MATH_FALLBACK(fmin)
2063  _GLIBCXX_SIMD_MATH_FALLBACK(fma)
2064 
2065  template <typename _Tp, size_t _Np>
2066  static constexpr _MaskMember<_Tp>
2067  _S_isgreater(_SimdWrapper<_Tp, _Np> __x,
2068  _SimdWrapper<_Tp, _Np> __y) noexcept
2069  {
2070  using _Ip = __int_for_sizeof_t<_Tp>;
2071  const auto __xn = __vector_bitcast<_Ip>(__x);
2072  const auto __yn = __vector_bitcast<_Ip>(__y);
2073  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2074  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2075  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2076  __xp > __yp);
2077  }
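  // Note for _S_isgreater and the ordered comparisons below: the values are
  // compared as integers to avoid floating-point compares on possibly-NaN
  // inputs. Negative values are mapped to the negation of their magnitude
  // bits so that the integer order matches the floating-point order; NaN
  // lanes are then cleared via __andnot with _S_isunordered.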
2078 
2079  template <typename _Tp, size_t _Np>
2080  static constexpr _MaskMember<_Tp>
2081  _S_isgreaterequal(_SimdWrapper<_Tp, _Np> __x,
2082  _SimdWrapper<_Tp, _Np> __y) noexcept
2083  {
2084  using _Ip = __int_for_sizeof_t<_Tp>;
2085  const auto __xn = __vector_bitcast<_Ip>(__x);
2086  const auto __yn = __vector_bitcast<_Ip>(__y);
2087  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2088  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2089  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2090  __xp >= __yp);
2091  }
2092 
2093  template <typename _Tp, size_t _Np>
2094  static constexpr _MaskMember<_Tp>
2095  _S_isless(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y) noexcept
2096  {
2097  using _Ip = __int_for_sizeof_t<_Tp>;
2098  const auto __xn = __vector_bitcast<_Ip>(__x);
2099  const auto __yn = __vector_bitcast<_Ip>(__y);
2100  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2101  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2102  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2103  __xp < __yp);
2104  }
2105 
2106  template <typename _Tp, size_t _Np>
2107  static constexpr _MaskMember<_Tp>
2108  _S_islessequal(_SimdWrapper<_Tp, _Np> __x,
2109  _SimdWrapper<_Tp, _Np> __y) noexcept
2110  {
2111  using _Ip = __int_for_sizeof_t<_Tp>;
2112  const auto __xn = __vector_bitcast<_Ip>(__x);
2113  const auto __yn = __vector_bitcast<_Ip>(__y);
2114  const auto __xp = __xn < 0 ? -(__xn & __finite_max_v<_Ip>) : __xn;
2115  const auto __yp = __yn < 0 ? -(__yn & __finite_max_v<_Ip>) : __yn;
2116  return __andnot(_SuperImpl::_S_isunordered(__x, __y)._M_data,
2117  __xp <= __yp);
2118  }
2119 
2120  template <typename _Tp, size_t _Np>
2121  static constexpr _MaskMember<_Tp>
2122  _S_islessgreater(_SimdWrapper<_Tp, _Np> __x,
2123  _SimdWrapper<_Tp, _Np> __y) noexcept
2124  {
2125  return __andnot(_SuperImpl::_S_isunordered(__x, __y),
2126  _SuperImpl::_S_not_equal_to(__x, __y));
2127  }
2128 
2129 #undef _GLIBCXX_SIMD_MATH_FALLBACK
2130 #undef _GLIBCXX_SIMD_MATH_FALLBACK_MASKRET
2131 #undef _GLIBCXX_SIMD_MATH_FALLBACK_FIXEDRET
2132  // _S_abs {{{3
2133  template <typename _Tp, size_t _Np>
2134  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2135  _S_abs(_SimdWrapper<_Tp, _Np> __x) noexcept
2136  {
2137  // if (__builtin_is_constant_evaluated())
2138  // {
2139  // return __x._M_data < 0 ? -__x._M_data : __x._M_data;
2140  // }
2141  if constexpr (is_floating_point_v<_Tp>)
 2142  // `v < 0 ? -v : v` cannot compile down to simply masking off the sign
 2143  // bit, because the compiler must also handle v == -0. correctly
 2144 
 2145  // ~(-0.) & v would be easy, but breaks with -fno-signed-zeros
2146  return __and(_S_absmask<__vector_type_t<_Tp, _Np>>, __x._M_data);
2147  else
2148  return __x._M_data < 0 ? -__x._M_data : __x._M_data;
2149  }
2150 
2151  // }}}3
2152  // _S_plus_minus {{{
2153  // Returns __x + __y - __y without -fassociative-math optimizing to __x.
2154  // - _TV must be __vector_type_t<floating-point type, N>.
2155  // - _UV must be _TV or floating-point type.
2156  template <typename _TV, typename _UV>
2157  _GLIBCXX_SIMD_INTRINSIC static constexpr _TV
2158  _S_plus_minus(_TV __x, _UV __y) noexcept
2159  {
2160 #if defined __i386__ && !defined __SSE_MATH__
2161  if constexpr (sizeof(__x) == 8)
2162  { // operations on __x would use the FPU
2163  static_assert(is_same_v<_TV, __vector_type_t<float, 2>>);
2164  const auto __x4 = __vector_bitcast<float, 4>(__x);
2165  if constexpr (is_same_v<_TV, _UV>)
2166  return __vector_bitcast<float, 2>(
2167  _S_plus_minus(__x4, __vector_bitcast<float, 4>(__y)));
2168  else
2169  return __vector_bitcast<float, 2>(_S_plus_minus(__x4, __y));
2170  }
2171 #endif
2172 #if !defined __clang__ && __GCC_IEC_559 == 0
2173  if (__builtin_is_constant_evaluated()
2174  || (__builtin_constant_p(__x) && __builtin_constant_p(__y)))
2175  return (__x + __y) - __y;
2176  else
2177  return [&] {
2178  __x += __y;
2179  if constexpr(__have_sse)
2180  {
2181  if constexpr (sizeof(__x) >= 16)
2182  asm("" : "+x"(__x));
2183  else if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2184  asm("" : "+x"(__x[0]), "+x"(__x[1]));
2185  else
2186  __assert_unreachable<_TV>();
2187  }
2188  else if constexpr(__have_neon)
2189  asm("" : "+w"(__x));
2190  else if constexpr (__have_power_vmx)
2191  {
2192  if constexpr (is_same_v<__vector_type_t<float, 2>, _TV>)
2193  asm("" : "+fgr"(__x[0]), "+fgr"(__x[1]));
2194  else
2195  asm("" : "+v"(__x));
2196  }
2197  else
2198  asm("" : "+g"(__x));
2199  return __x - __y;
2200  }();
2201 #else
2202  return (__x + __y) - __y;
2203 #endif
2204  }
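  // Note: the empty asm statements above act as optimization barriers. This
  // path is only compiled when __GCC_IEC_559 == 0 (e.g. -fassociative-math /
  // -ffast-math); without the barrier the compiler could fold
  // (__x + __y) - __y back to __x, defeating callers such as _S_nearbyint
  // and _S_trunc that depend on the intermediate rounding.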
2205 
2206  // }}}
2207  // _S_nearbyint {{{3
2208  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2209  _GLIBCXX_SIMD_INTRINSIC static _Tp
2210  _S_nearbyint(_Tp __x_) noexcept
2211  {
2212  using value_type = typename _TVT::value_type;
2213  using _V = typename _TVT::type;
2214  const _V __x = __x_;
2215  const _V __absx = __and(__x, _S_absmask<_V>);
2216  static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<value_type>);
2217  _GLIBCXX_SIMD_USE_CONSTEXPR _V __shifter_abs
2218  = _V() + (1ull << (__digits_v<value_type> - 1));
2219  const _V __shifter = __or(__and(_S_signmask<_V>, __x), __shifter_abs);
2220  const _V __shifted = _S_plus_minus(__x, __shifter);
2221  return __absx < __shifter_abs ? __shifted : __x;
2222  }
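  // Worked example (float, round-to-nearest-even): __digits_v<float> == 24,
  // so __shifter_abs == 2^23 == 8388608. For |x| < 2^23 the addition
  // discards all fraction bits in the current rounding mode and the
  // subtraction restores the magnitude:
  //   (2.5f + 8388608.0f) - 8388608.0f == 2.0f
  // Values with |x| >= 2^23 are already integral and returned unchanged.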
2223 
2224  // _S_rint {{{3
2225  template <typename _Tp, typename _TVT = _VectorTraits<_Tp>>
2226  _GLIBCXX_SIMD_INTRINSIC static _Tp
2227  _S_rint(_Tp __x) noexcept
2228  { return _SuperImpl::_S_nearbyint(__x); }
2229 
2230  // _S_trunc {{{3
2231  template <typename _Tp, size_t _Np>
2232  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2233  _S_trunc(_SimdWrapper<_Tp, _Np> __x)
2234  {
2235  using _V = __vector_type_t<_Tp, _Np>;
2236  const _V __absx = __and(__x._M_data, _S_absmask<_V>);
2237  static_assert(__CHAR_BIT__ * sizeof(1ull) >= __digits_v<_Tp>);
2238  constexpr _Tp __shifter = 1ull << (__digits_v<_Tp> - 1);
2239  _V __truncated = _S_plus_minus(__absx, __shifter);
2240  __truncated -= __truncated > __absx ? _V() + 1 : _V();
2241  return __absx < __shifter ? __or(__xor(__absx, __x._M_data), __truncated)
2242  : __x._M_data;
2243  }
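  // Worked example (float): trunc(-2.7f): __absx == 2.7f,
  // _S_plus_minus(2.7f, 2^23) rounds to 3.0f; 3.0f > 2.7f, so 1 is
  // subtracted, giving 2.0f; __xor(__absx, __x) recovers the sign bit and
  // __or() re-applies it, yielding -2.0f. Values with |x| >= 2^23 are
  // already integral and returned unchanged.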
2244 
2245  // _S_round {{{3
2246  template <typename _Tp, size_t _Np>
2247  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2248  _S_round(_SimdWrapper<_Tp, _Np> __x)
2249  {
2250  const auto __abs_x = _SuperImpl::_S_abs(__x);
2251  const auto __t_abs = _SuperImpl::_S_trunc(__abs_x)._M_data;
2252  const auto __r_abs // round(abs(x)) =
2253  = __t_abs + (__abs_x._M_data - __t_abs >= _Tp(.5) ? _Tp(1) : 0);
2254  return __or(__xor(__abs_x._M_data, __x._M_data), __r_abs);
2255  }
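  // Worked example: round() rounds halfway cases away from zero.
  // round(-2.5f): __abs_x == 2.5, __t_abs == 2.0, 2.5 - 2.0 >= 0.5, so
  // __r_abs == 3.0; or-ing the sign bit (__abs_x ^ __x) back in gives -3.0f.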
2256 
2257  // _S_floor {{{3
2258  template <typename _Tp, size_t _Np>
2259  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2260  _S_floor(_SimdWrapper<_Tp, _Np> __x)
2261  {
2262  const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2263  const auto __negative_input
2264  = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
2265  const auto __mask
2266  = __andnot(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2267  return __or(__andnot(__mask, __y),
2268  __and(__mask, __y - __vector_broadcast<_Np, _Tp>(1)));
2269  }
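  // Worked example: floor(-2.3f): __y == trunc == -2.0; the input is
  // negative and __y != __x, so __mask selects __y - 1 == -3.0f. For
  // non-negative or already-integral inputs __mask is false and trunc() is
  // the result.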
2270 
2271  // _S_ceil {{{3
2272  template <typename _Tp, size_t _Np>
2273  _GLIBCXX_SIMD_INTRINSIC static _SimdWrapper<_Tp, _Np>
2274  _S_ceil(_SimdWrapper<_Tp, _Np> __x)
2275  {
2276  const auto __y = _SuperImpl::_S_trunc(__x)._M_data;
2277  const auto __negative_input
2278  = __vector_bitcast<_Tp>(__x._M_data < __vector_broadcast<_Np, _Tp>(0));
2279  const auto __inv_mask
2280  = __or(__vector_bitcast<_Tp>(__y == __x._M_data), __negative_input);
2281  return __or(__and(__inv_mask, __y),
2282  __andnot(__inv_mask, __y + __vector_broadcast<_Np, _Tp>(1)));
2283  }
2284 
2285  // _S_isnan {{{3
2286  template <typename _Tp, size_t _Np>
2287  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2288  _S_isnan([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2289  {
2290 #if __FINITE_MATH_ONLY__
2291  return {}; // false
2292 #elif !defined __SUPPORT_SNAN__
2293  return ~(__x._M_data == __x._M_data);
2294 #elif defined __STDC_IEC_559__
2295  using _Ip = __int_for_sizeof_t<_Tp>;
2296  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2297  const auto __infn
2298  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__infinity_v<_Tp>));
2299  return __infn < __absn;
2300 #else
2301 #error "Not implemented: how to support SNaN but non-IEC559 floating-point?"
2302 #endif
2303  }
2304 
2305  // _S_isfinite {{{3
2306  template <typename _Tp, size_t _Np>
2307  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2308  _S_isfinite([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2309  {
2310 #if __FINITE_MATH_ONLY__
2311  using _UV = typename _MaskMember<_Tp>::_BuiltinType;
2312  _GLIBCXX_SIMD_USE_CONSTEXPR _UV __alltrue = ~_UV();
2313  return __alltrue;
2314 #else
2315  // if all exponent bits are set, __x is either inf or NaN
2316  using _Ip = __int_for_sizeof_t<_Tp>;
2317  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2318  const auto __maxn
2319  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2320  return __absn <= __maxn;
2321 #endif
2322  }
2323 
2324  // _S_isunordered {{{3
2325  template <typename _Tp, size_t _Np>
2326  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2327  _S_isunordered(_SimdWrapper<_Tp, _Np> __x, _SimdWrapper<_Tp, _Np> __y)
2328  { return __or(_S_isnan(__x), _S_isnan(__y)); }
2329 
2330  // _S_signbit {{{3
2331  template <typename _Tp, size_t _Np>
2332  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2333  _S_signbit(_SimdWrapper<_Tp, _Np> __x)
2334  {
2335  using _Ip = __int_for_sizeof_t<_Tp>;
2336  return __vector_bitcast<_Ip>(__x) < 0;
2337  // Arithmetic right shift (SRA) would also work (instead of compare), but
2338  // 64-bit SRA isn't available on x86 before AVX512. And in general,
2339  // compares are more likely to be efficient than SRA.
2340  }
2341 
2342  // _S_isinf {{{3
2343  template <typename _Tp, size_t _Np>
2344  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2345  _S_isinf([[maybe_unused]] _SimdWrapper<_Tp, _Np> __x)
2346  {
2347 #if __FINITE_MATH_ONLY__
2348  return {}; // false
2349 #else
2350  return _SuperImpl::template _S_equal_to<_Tp, _Np>(_SuperImpl::_S_abs(__x),
2351  __vector_broadcast<_Np>(
2352  __infinity_v<_Tp>));
2353  // alternative:
2354  // compare to inf using the corresponding integer type
2355  /*
2356  return
2357  __vector_bitcast<_Tp>(__vector_bitcast<__int_for_sizeof_t<_Tp>>(
2358  _S_abs(__x)._M_data)
2359  ==
2360  __vector_bitcast<__int_for_sizeof_t<_Tp>>(__vector_broadcast<_Np>(
2361  __infinity_v<_Tp>)));
2362  */
2363 #endif
2364  }
2365 
2366  // _S_isnormal {{{3
2367  template <typename _Tp, size_t _Np>
2368  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2369  _S_isnormal(_SimdWrapper<_Tp, _Np> __x)
2370  {
2371  using _Ip = __int_for_sizeof_t<_Tp>;
2372  const auto __absn = __vector_bitcast<_Ip>(_SuperImpl::_S_abs(__x));
2373  const auto __minn
2374  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__norm_min_v<_Tp>));
2375 #if __FINITE_MATH_ONLY__
2376  return __absn >= __minn;
2377 #else
2378  const auto __maxn
2379  = __vector_bitcast<_Ip>(__vector_broadcast<_Np>(__finite_max_v<_Tp>));
2380  return __minn <= __absn && __absn <= __maxn;
2381 #endif
2382  }
2383 
2384  // _S_fpclassify {{{3
2385  template <typename _Tp, size_t _Np>
2386  _GLIBCXX_SIMD_INTRINSIC static __fixed_size_storage_t<int, _Np>
2387  _S_fpclassify(_SimdWrapper<_Tp, _Np> __x)
2388  {
2389  using _I = __int_for_sizeof_t<_Tp>;
2390  const auto __xn
2391  = __vector_bitcast<_I>(__to_intrin(_SuperImpl::_S_abs(__x)));
2392  constexpr size_t _NI = sizeof(__xn) / sizeof(_I);
2393  _GLIBCXX_SIMD_USE_CONSTEXPR auto __minn
2394  = __vector_bitcast<_I>(__vector_broadcast<_NI>(__norm_min_v<_Tp>));
2395 
2396  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_normal
2397  = __vector_broadcast<_NI, _I>(FP_NORMAL);
2398 #if !__FINITE_MATH_ONLY__
2399  _GLIBCXX_SIMD_USE_CONSTEXPR auto __infn
2400  = __vector_bitcast<_I>(__vector_broadcast<_NI>(__infinity_v<_Tp>));
2401  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_nan
2402  = __vector_broadcast<_NI, _I>(FP_NAN);
2403  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_infinite
2404  = __vector_broadcast<_NI, _I>(FP_INFINITE);
2405 #endif
2406 #ifndef __FAST_MATH__
2407  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_subnormal
2408  = __vector_broadcast<_NI, _I>(FP_SUBNORMAL);
2409 #endif
2410  _GLIBCXX_SIMD_USE_CONSTEXPR auto __fp_zero
2411  = __vector_broadcast<_NI, _I>(FP_ZERO);
2412 
2413  __vector_type_t<_I, _NI>
2414  __tmp = __xn < __minn
2415  #ifdef __FAST_MATH__
2416  ? __fp_zero
2417  #else
2418  ? (__xn == 0 ? __fp_zero : __fp_subnormal)
2419  #endif
2420  #if __FINITE_MATH_ONLY__
2421  : __fp_normal;
2422  #else
2423  : (__xn < __infn ? __fp_normal
2424  : (__xn == __infn ? __fp_infinite : __fp_nan));
2425  #endif
2426 
2427  if constexpr (sizeof(_I) == sizeof(int))
2428  {
2429  using _FixedInt = __fixed_size_storage_t<int, _Np>;
2430  const auto __as_int = __vector_bitcast<int, _Np>(__tmp);
2431  if constexpr (_FixedInt::_S_tuple_size == 1)
2432  return {__as_int};
2433  else if constexpr (_FixedInt::_S_tuple_size == 2
2434  && is_same_v<
2435  typename _FixedInt::_SecondType::_FirstAbi,
2436  simd_abi::scalar>)
2437  return {__extract<0, 2>(__as_int), __as_int[_Np - 1]};
2438  else if constexpr (_FixedInt::_S_tuple_size == 2)
2439  return {__extract<0, 2>(__as_int),
2440  __auto_bitcast(__extract<1, 2>(__as_int))};
2441  else
2442  __assert_unreachable<_Tp>();
2443  }
2444  else if constexpr (_Np == 2 && sizeof(_I) == 8
2445  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 2)
2446  {
2447  const auto __aslong = __vector_bitcast<_LLong>(__tmp);
2448  return {int(__aslong[0]), {int(__aslong[1])}};
2449  }
2450 #if _GLIBCXX_SIMD_X86INTRIN
2451  else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 32
2452  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2453  return {_mm_packs_epi32(__to_intrin(__lo128(__tmp)),
2454  __to_intrin(__hi128(__tmp)))};
2455  else if constexpr (sizeof(_Tp) == 8 && sizeof(__tmp) == 64
2456  && __fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2457  return {_mm512_cvtepi64_epi32(__to_intrin(__tmp))};
2458 #endif // _GLIBCXX_SIMD_X86INTRIN
2459  else if constexpr (__fixed_size_storage_t<int, _Np>::_S_tuple_size == 1)
2460  return {__call_with_subscripts<_Np>(__vector_bitcast<_LLong>(__tmp),
2461  [](auto... __l) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2462  return __make_wrapper<int>(__l...);
2463  })};
2464  else
2465  __assert_unreachable<_Tp>();
2466  }
2467 
2468  // _S_increment & _S_decrement{{{2
2469  template <typename _Tp, size_t _Np>
2470  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2471  _S_increment(_SimdWrapper<_Tp, _Np>& __x)
2472  { __x = __x._M_data + 1; }
2473 
2474  template <typename _Tp, size_t _Np>
2475  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2476  _S_decrement(_SimdWrapper<_Tp, _Np>& __x)
2477  { __x = __x._M_data - 1; }
2478 
2479  // smart_reference access {{{2
2480  template <typename _Tp, size_t _Np, typename _Up>
2481  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2482  _S_set(_SimdWrapper<_Tp, _Np>& __v, int __i, _Up&& __x) noexcept
2483  { __v._M_set(__i, static_cast<_Up&&>(__x)); }
2484 
2485  // _S_masked_assign{{{2
2486  template <typename _Tp, typename _K, size_t _Np>
2487  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2488  _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2489  __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2490  {
2491  if (__k._M_is_constprop_none_of())
2492  return;
2493  else if (__k._M_is_constprop_all_of())
2494  __lhs = __rhs;
2495  else
2496  __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs);
2497  }
2498 
2499  template <typename _Tp, typename _K, size_t _Np>
2500  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2501  _S_masked_assign(_SimdWrapper<_K, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2502  __type_identity_t<_Tp> __rhs)
2503  {
2504  if (__k._M_is_constprop_none_of())
2505  return;
2506  else if (__k._M_is_constprop_all_of())
2507  __lhs = __vector_broadcast<_Np>(__rhs);
2508  else if (__builtin_constant_p(__rhs) && __rhs == 0)
2509  {
2510  if constexpr (!is_same_v<bool, _K>)
2511  // the __andnot optimization only makes sense if __k._M_data is a
2512  // vector register
2513  __lhs._M_data
2514  = __andnot(__vector_bitcast<_Tp>(__k), __lhs._M_data);
2515  else
2516  // for AVX512/__mmask, a _mm512_maskz_mov is best
2517  __lhs
2518  = _CommonImpl::_S_blend(__k, __lhs, _SimdWrapper<_Tp, _Np>());
2519  }
2520  else
2521  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2522  _SimdWrapper<_Tp, _Np>(
2523  __vector_broadcast<_Np>(__rhs)));
2524  }
2525 
2526  // _S_masked_cassign {{{2
2527  template <typename _Op, typename _Tp, typename _K, size_t _Np>
2528  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2529  _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2530  _SimdWrapper<_Tp, _Np>& __lhs,
2531  const __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs,
2532  _Op __op)
2533  {
2534  if (__k._M_is_constprop_none_of())
2535  return;
2536  else if (__k._M_is_constprop_all_of())
2537  __lhs = __op(_SuperImpl{}, __lhs, __rhs);
2538  else
2539  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2540  __op(_SuperImpl{}, __lhs, __rhs));
2541  }
2542 
2543  template <typename _Op, typename _Tp, typename _K, size_t _Np>
2544  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2545  _S_masked_cassign(const _SimdWrapper<_K, _Np> __k,
2546  _SimdWrapper<_Tp, _Np>& __lhs,
2547  const __type_identity_t<_Tp> __rhs, _Op __op)
2548  { _S_masked_cassign(__k, __lhs, __vector_broadcast<_Np>(__rhs), __op); }
2549 
2550  // _S_masked_unary {{{2
2551  template <template <typename> class _Op, typename _Tp, typename _K,
2552  size_t _Np>
2553  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2554  _S_masked_unary(const _SimdWrapper<_K, _Np> __k,
2555  const _SimdWrapper<_Tp, _Np> __v)
2556  {
2557  if (__k._M_is_constprop_none_of())
2558  return __v;
2559  auto __vv = _M_make_simd(__v);
2560  _Op<decltype(__vv)> __op;
2561  if (__k._M_is_constprop_all_of())
2562  return __data(__op(__vv));
2563  else if constexpr (is_same_v<_Op<void>, __increment<void>>)
2564  {
2565  static_assert(not std::is_same_v<_K, bool>);
2566  if constexpr (is_integral_v<_Tp>)
2567  // Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2568  return __v._M_data - __vector_bitcast<_Tp>(__k._M_data);
2569  else if constexpr (not __have_avx2)
2570  return __v._M_data
2571  + __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2572  _K, _Tp(1)));
2573  // starting with AVX2 it is more efficient to blend after add
2574  }
2575  else if constexpr (is_same_v<_Op<void>, __decrement<void>>)
2576  {
2577  static_assert(not std::is_same_v<_K, bool>);
2578  if constexpr (is_integral_v<_Tp>)
2579  // Take a shortcut knowing that __k is an integer vector with values -1 or 0.
2580  return __v._M_data + __vector_bitcast<_Tp>(__k._M_data);
2581  else if constexpr (not __have_avx2)
2582  return __v._M_data
2583  - __vector_bitcast<_Tp>(__k._M_data & __builtin_bit_cast(
2584  _K, _Tp(1)));
2585  // starting with AVX2 it is more efficient to blend after sub
2586  }
2587  return _CommonImpl::_S_blend(__k, __v, __data(__op(__vv)));
2588  }
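  // Note on the shortcuts above: for integral _Tp a true mask lane is -1,
  // so __v - __k adds 1 exactly in the selected lanes (increment) and
  // __v + __k subtracts 1 (decrement), without a separate blend.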
2589 
2590  //}}}2
2591  };
2592 
2593 // _MaskImplBuiltinMixin {{{1
2594 struct _MaskImplBuiltinMixin
2595 {
2596  template <typename _Tp>
2597  using _TypeTag = _Tp*;
2598 
2599  // _S_to_maskvector {{{
2600  template <typename _Up, size_t _ToN = 1>
2601  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2602  _S_to_maskvector(bool __x)
2603  {
2604  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2605  return __x ? __vector_type_t<_Up, _ToN>{~_Up()}
2606  : __vector_type_t<_Up, _ToN>{};
2607  }
2608 
2609  template <typename _Up, size_t _UpN = 0, size_t _Np, bool _Sanitized,
2610  size_t _ToN = _UpN == 0 ? _Np : _UpN>
2611  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2612  _S_to_maskvector(_BitMask<_Np, _Sanitized> __x)
2613  {
2614  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2615  return __generate_vector<__vector_type_t<_Up, _ToN>>(
2616  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2617  if constexpr (__i < _Np)
2618  return __x[__i] ? ~_Up() : _Up();
2619  else
2620  return _Up();
2621  });
2622  }
2623 
2624  template <typename _Up, size_t _UpN = 0, typename _Tp, size_t _Np,
2625  size_t _ToN = _UpN == 0 ? _Np : _UpN>
2626  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Up, _ToN>
2627  _S_to_maskvector(_SimdWrapper<_Tp, _Np> __x)
2628  {
2629  static_assert(is_same_v<_Up, __int_for_sizeof_t<_Up>>);
2630  using _TW = _SimdWrapper<_Tp, _Np>;
2631  using _UW = _SimdWrapper<_Up, _ToN>;
2632  if constexpr (sizeof(_Up) == sizeof(_Tp) && sizeof(_TW) == sizeof(_UW))
2633  return __wrapper_bitcast<_Up, _ToN>(__x);
2634  else if constexpr (is_same_v<_Tp, bool>) // bits -> vector
2635  return _S_to_maskvector<_Up, _ToN>(_BitMask<_Np>(__x._M_data));
2636  else
2637  { // vector -> vector
2638  /*
2639  [[maybe_unused]] const auto __y = __vector_bitcast<_Up>(__x._M_data);
2640  if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 4 && sizeof(__y) ==
2641  16) return __vector_permute<1, 3, -1, -1>(__y); else if constexpr
2642  (sizeof(_Tp) == 4 && sizeof(_Up) == 2
2643  && sizeof(__y) == 16)
2644  return __vector_permute<1, 3, 5, 7, -1, -1, -1, -1>(__y);
2645  else if constexpr (sizeof(_Tp) == 8 && sizeof(_Up) == 2
2646  && sizeof(__y) == 16)
2647  return __vector_permute<3, 7, -1, -1, -1, -1, -1, -1>(__y);
2648  else if constexpr (sizeof(_Tp) == 2 && sizeof(_Up) == 1
2649  && sizeof(__y) == 16)
2650  return __vector_permute<1, 3, 5, 7, 9, 11, 13, 15, -1, -1, -1, -1,
2651  -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 4 &&
2652  sizeof(_Up) == 1
2653  && sizeof(__y) == 16)
2654  return __vector_permute<3, 7, 11, 15, -1, -1, -1, -1, -1, -1, -1,
2655  -1, -1, -1, -1, -1>(__y); else if constexpr (sizeof(_Tp) == 8 &&
2656  sizeof(_Up) == 1
2657  && sizeof(__y) == 16)
2658  return __vector_permute<7, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
2659  -1, -1, -1, -1, -1>(__y); else
2660  */
2661  {
2662  return __generate_vector<__vector_type_t<_Up, _ToN>>(
2663  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2664  if constexpr (__i < _Np)
2665  return _Up(__x[__i.value]);
2666  else
2667  return _Up();
2668  });
2669  }
2670  }
2671  }
2672 
2673  // }}}
2674  // _S_to_bits {{{
2675  template <typename _Tp, size_t _Np>
2676  _GLIBCXX_SIMD_INTRINSIC static constexpr _SanitizedBitMask<_Np>
2677  _S_to_bits(_SimdWrapper<_Tp, _Np> __x)
2678  {
2679  static_assert(!is_same_v<_Tp, bool>);
2680  static_assert(_Np <= __CHAR_BIT__ * sizeof(_ULLong));
2681  using _Up = make_unsigned_t<__int_for_sizeof_t<_Tp>>;
2682  const auto __bools
2683  = __vector_bitcast<_Up>(__x) >> (sizeof(_Up) * __CHAR_BIT__ - 1);
2684  _ULLong __r = 0;
2685  __execute_n_times<_Np>(
2686  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2687  __r |= _ULLong(__bools[__i.value]) << __i;
2688  });
2689  return __r;
2690  }
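  // Illustrative example: each mask lane is all-zeros or all-ones, so the
  // unsigned right shift by (sizeof(_Up) * CHAR_BIT - 1) leaves 0 or 1 per
  // lane, which is OR-ed into bit __i of the result. A 4-lane int mask
  // {-1, 0, -1, 0} therefore yields the bitmask 0b0101.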
2691 
2692  // }}}
2693 };
2694 
2695 // _MaskImplBuiltin {{{1
2696 template <typename _Abi, typename>
2697  struct _MaskImplBuiltin : _MaskImplBuiltinMixin
2698  {
2699  using _MaskImplBuiltinMixin::_S_to_bits;
2700  using _MaskImplBuiltinMixin::_S_to_maskvector;
2701 
2702  // member types {{{
2703  template <typename _Tp>
2704  using _SimdMember = typename _Abi::template __traits<_Tp>::_SimdMember;
2705 
2706  template <typename _Tp>
2707  using _MaskMember = typename _Abi::template _MaskMember<_Tp>;
2708 
2709  using _SuperImpl = typename _Abi::_MaskImpl;
2710  using _CommonImpl = typename _Abi::_CommonImpl;
2711 
2712  template <typename _Tp>
2713  static constexpr size_t _S_size = simd_size_v<_Tp, _Abi>;
2714 
2715  // }}}
2716  // _S_broadcast {{{
2717  template <typename _Tp>
2718  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2719  _S_broadcast(bool __x)
2720  { return __x ? _Abi::template _S_implicit_mask<_Tp>() : _MaskMember<_Tp>(); }
2721 
2722  // }}}
2723  // _S_load {{{
2724  template <typename _Tp>
2725  _GLIBCXX_SIMD_INTRINSIC static constexpr _MaskMember<_Tp>
2726  _S_load(const bool* __mem)
2727  {
2728  using _I = __int_for_sizeof_t<_Tp>;
2729  if (not __builtin_is_constant_evaluated())
2730  if constexpr (sizeof(_Tp) == sizeof(bool))
2731  {
2732  const auto __bools
2733  = _CommonImpl::template _S_load<_I, _S_size<_Tp>>(__mem);
2734  // bool is {0, 1}, everything else is UB
2735  return __bools > 0;
2736  }
2737  return __generate_vector<_I, _S_size<_Tp>>(
2738  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2739  return __mem[__i] ? ~_I() : _I();
2740  });
2741  }
2742 
2743  // }}}
2744  // _S_convert {{{
2745  template <typename _Tp, size_t _Np, bool _Sanitized>
2746  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2747  _S_convert(_BitMask<_Np, _Sanitized> __x)
2748  {
2749  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2750  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_to_bits());
2751  else
2752  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2753  _S_size<_Tp>>(
2754  __x._M_sanitized());
2755  }
2756 
2757  template <typename _Tp, size_t _Np>
2758  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2759  _S_convert(_SimdWrapper<bool, _Np> __x)
2760  {
2761  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2762  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(__x._M_data);
2763  else
2764  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2765  _S_size<_Tp>>(
2766  _BitMask<_Np>(__x._M_data)._M_sanitized());
2767  }
2768 
2769  template <typename _Tp, typename _Up, size_t _Np>
2770  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2771  _S_convert(_SimdWrapper<_Up, _Np> __x)
2772  {
2773  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2774  return _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>(
2775  _SuperImpl::_S_to_bits(__x));
2776  else
2777  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2778  _S_size<_Tp>>(__x);
2779  }
2780 
2781  template <typename _Tp, typename _Up, typename _UAbi>
2782  _GLIBCXX_SIMD_INTRINSIC static constexpr auto
2783  _S_convert(simd_mask<_Up, _UAbi> __x)
2784  {
2785  if constexpr (__is_builtin_bitmask_abi<_Abi>())
2786  {
2787  using _R = _SimdWrapper<bool, simd_size_v<_Tp, _Abi>>;
2788  if constexpr (__is_builtin_bitmask_abi<_UAbi>()) // bits -> bits
2789  return _R(__data(__x));
2790  else if constexpr (__is_scalar_abi<_UAbi>()) // bool -> bits
2791  return _R(__data(__x));
2792  else if constexpr (__is_fixed_size_abi_v<_UAbi>) // bitset -> bits
2793  return _R(__data(__x)._M_to_bits());
2794  else // vector -> bits
2795  return _R(_UAbi::_MaskImpl::_S_to_bits(__data(__x))._M_to_bits());
2796  }
2797  else
2798  return _SuperImpl::template _S_to_maskvector<__int_for_sizeof_t<_Tp>,
2799  _S_size<_Tp>>(
2800  __data(__x));
2801  }
2802 
2803  // }}}
2804  // _S_masked_load {{{2
2805  template <typename _Tp, size_t _Np>
2806  static inline _SimdWrapper<_Tp, _Np>
2807  _S_masked_load(_SimdWrapper<_Tp, _Np> __merge,
2808  _SimdWrapper<_Tp, _Np> __mask, const bool* __mem) noexcept
2809  {
2810  // AVX(2) has 32/64 bit maskload, but nothing at 8 bit granularity
2811  auto __tmp = __wrapper_bitcast<__int_for_sizeof_t<_Tp>>(__merge);
2812  _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__mask),
2813  [&](auto __i) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2814  __tmp._M_set(__i, -__mem[__i]);
2815  });
2816  __merge = __wrapper_bitcast<_Tp>(__tmp);
2817  return __merge;
2818  }
2819 
2820  // _S_store {{{2
2821  template <typename _Tp, size_t _Np>
2822  _GLIBCXX_SIMD_INTRINSIC static constexpr void
2823  _S_store(_SimdWrapper<_Tp, _Np> __v, bool* __mem) noexcept
2824  {
2825  __execute_n_times<_Np>([&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2826  __mem[__i] = __v[__i];
2827  });
2828  }
2829 
2830  // _S_masked_store {{{2
2831  template <typename _Tp, size_t _Np>
2832  static inline void
2833  _S_masked_store(const _SimdWrapper<_Tp, _Np> __v, bool* __mem,
2834  const _SimdWrapper<_Tp, _Np> __k) noexcept
2835  {
2836  _BitOps::_S_bit_iteration(_SuperImpl::_S_to_bits(__k),
2837  [&](auto __i) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2838  __mem[__i] = __v[__i];
2839  });
2840  }
2841 
2842  // _S_from_bitmask{{{2
2843  template <size_t _Np, typename _Tp>
2844  _GLIBCXX_SIMD_INTRINSIC static _MaskMember<_Tp>
2845  _S_from_bitmask(_SanitizedBitMask<_Np> __bits, _TypeTag<_Tp>)
2846  { return _SuperImpl::template _S_to_maskvector<_Tp, _S_size<_Tp>>(__bits); }
2847 
2848  // logical and bitwise operators {{{2
2849  template <typename _Tp, size_t _Np>
2850  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2851  _S_logical_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2852  { return __and(__x._M_data, __y._M_data); }
2853 
2854  template <typename _Tp, size_t _Np>
2855  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2856  _S_logical_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2857  { return __or(__x._M_data, __y._M_data); }
2858 
2859  template <typename _Tp, size_t _Np>
2860  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2861  _S_bit_not(const _SimdWrapper<_Tp, _Np>& __x)
2862  {
2863  if constexpr (_Abi::template _S_is_partial<_Tp>)
2864  return __andnot(__x, __wrapper_bitcast<_Tp>(
2865  _Abi::template _S_implicit_mask<_Tp>()));
2866  else
2867  return __not(__x._M_data);
2868  }
2869 
2870  template <typename _Tp, size_t _Np>
2871  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2872  _S_bit_and(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2873  { return __and(__x._M_data, __y._M_data); }
2874 
2875  template <typename _Tp, size_t _Np>
2876  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2877  _S_bit_or(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2878  { return __or(__x._M_data, __y._M_data); }
2879 
2880  template <typename _Tp, size_t _Np>
2881  _GLIBCXX_SIMD_INTRINSIC static constexpr _SimdWrapper<_Tp, _Np>
2882  _S_bit_xor(const _SimdWrapper<_Tp, _Np>& __x, const _SimdWrapper<_Tp, _Np>& __y)
2883  { return __xor(__x._M_data, __y._M_data); }
2884 
2885  // smart_reference access {{{2
2886  template <typename _Tp, size_t _Np>
2887  static constexpr void
2888  _S_set(_SimdWrapper<_Tp, _Np>& __k, int __i, bool __x) noexcept
2889  {
2890  if constexpr (is_same_v<_Tp, bool>)
2891  __k._M_set(__i, __x);
2892  else
2893  {
2894  static_assert(is_same_v<_Tp, __int_for_sizeof_t<_Tp>>);
2895  if (__builtin_is_constant_evaluated())
2896  {
2897  __k = __generate_from_n_evaluations<_Np,
2898  __vector_type_t<_Tp, _Np>>(
2899  [&](auto __j) _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA {
2900  if (__i == static_cast<int>(__j))
2901  return _Tp(-__x);
2902  else
2903  return __k[+__j];
2904  });
2905  }
2906  else
2907  __k._M_data[__i] = -__x;
2908  }
2909  }
2910 
2911  // _S_masked_assign{{{2
2912  template <typename _Tp, size_t _Np>
2913  _GLIBCXX_SIMD_INTRINSIC static void
2914  _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs,
2915  __type_identity_t<_SimdWrapper<_Tp, _Np>> __rhs)
2916  { __lhs = _CommonImpl::_S_blend(__k, __lhs, __rhs); }
2917 
2918  template <typename _Tp, size_t _Np>
2919  _GLIBCXX_SIMD_INTRINSIC static void
2920  _S_masked_assign(_SimdWrapper<_Tp, _Np> __k, _SimdWrapper<_Tp, _Np>& __lhs, bool __rhs)
2921  {
2922  if (__builtin_constant_p(__rhs))
2923  {
2924  if (__rhs == false)
2925  __lhs = __andnot(__k, __lhs);
2926  else
2927  __lhs = __or(__k, __lhs);
2928  return;
2929  }
2930  __lhs = _CommonImpl::_S_blend(__k, __lhs,
2931  __data(simd_mask<_Tp, _Abi>(__rhs)));
2932  }
2933 
2934  //}}}2
2935  // _S_all_of {{{
2936  template <typename _Tp>
2937  _GLIBCXX_SIMD_INTRINSIC static bool
2938  _S_all_of(simd_mask<_Tp, _Abi> __k)
2939  {
2940  return __call_with_subscripts(
2941  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2942  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2943  { return (... && !(__ent == 0)); });
2944  }
2945 
2946  // }}}
2947  // _S_any_of {{{
2948  template <typename _Tp>
2949  _GLIBCXX_SIMD_INTRINSIC static bool
2950  _S_any_of(simd_mask<_Tp, _Abi> __k)
2951  {
2952  return __call_with_subscripts(
2953  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2954  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2955  { return (... || !(__ent == 0)); });
2956  }
2957 
2958  // }}}
2959  // _S_none_of {{{
2960  template <typename _Tp>
2961  _GLIBCXX_SIMD_INTRINSIC static bool
2962  _S_none_of(simd_mask<_Tp, _Abi> __k)
2963  {
2964  return __call_with_subscripts(
2965  __data(__k), make_index_sequence<_S_size<_Tp>>(),
2966  [](const auto... __ent) constexpr _GLIBCXX_SIMD_ALWAYS_INLINE_LAMBDA
2967  { return (... && (__ent == 0)); });
2968  }
2969 
2970  // }}}
2971  // _S_some_of {{{
2972  template <typename _Tp>
2973  _GLIBCXX_SIMD_INTRINSIC static bool
2974  _S_some_of(simd_mask<_Tp, _Abi> __k)
2975  {
2976  const int __n_true = _SuperImpl::_S_popcount(__k);
2977  return __n_true > 0 && __n_true < int(_S_size<_Tp>);
2978  }
2979 
2980  // }}}
2981  // _S_popcount {{{
2982  template <typename _Tp>
2983  _GLIBCXX_SIMD_INTRINSIC static int
2984  _S_popcount(simd_mask<_Tp, _Abi> __k)
2985  {
2986  using _I = __int_for_sizeof_t<_Tp>;
2987  if constexpr (is_default_constructible_v<simd<_I, _Abi>>)
2988  return -reduce(
2989  simd<_I, _Abi>(__private_init, __wrapper_bitcast<_I>(__data(__k))));
2990  else
2991  return -reduce(__bit_cast<rebind_simd_t<_I, simd<_Tp, _Abi>>>(
2992  simd<_Tp, _Abi>(__private_init, __data(__k))));
2993  }
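  // Note: reinterpreted as the signed integer type _I, mask lanes hold 0 or
  // -1, so reduce() sums to minus the number of true lanes; the leading
  // negation turns that into the popcount.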
2994 
2995  // }}}
2996  // _S_find_first_set {{{
2997  template <typename _Tp>
2998  _GLIBCXX_SIMD_INTRINSIC static int
2999  _S_find_first_set(simd_mask<_Tp, _Abi> __k)
3000  { return std::__countr_zero(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()); }
3001 
3002  // }}}
3003  // _S_find_last_set {{{
3004  template <typename _Tp>
3005  _GLIBCXX_SIMD_INTRINSIC static int
3006  _S_find_last_set(simd_mask<_Tp, _Abi> __k)
3007  { return std::__bit_width(_SuperImpl::_S_to_bits(__data(__k))._M_to_bits()) - 1; }
3008 
3009  // }}}
3010  };
3011 
3012 //}}}1
3013 _GLIBCXX_SIMD_END_NAMESPACE
3014 #endif // __cplusplus >= 201703L
3015 #endif // _GLIBCXX_EXPERIMENTAL_SIMD_ABIS_H_
3016 
3017 // vim: foldmethod=marker foldmarker={{{,}}} sw=2 noet ts=8 sts=2 tw=100