sse2.h
1 /* Permission is hereby granted, free of charge, to any person
2  * obtaining a copy of this software and associated documentation
3  * files (the "Software"), to deal in the Software without
4  * restriction, including without limitation the rights to use, copy,
5  * modify, merge, publish, distribute, sublicense, and/or sell copies
6  * of the Software, and to permit persons to whom the Software is
7  * furnished to do so, subject to the following conditions:
8  *
9  * The above copyright notice and this permission notice shall be
10  * included in all copies or substantial portions of the Software.
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
13  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
15  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
16  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
17  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19  * SOFTWARE.
20  *
21  * Copyright:
22  * 2017 Evan Nemerson <evan@nemerson.com>
23  * 2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
24  * 2015 Brandon Rowlett <browlett@nvidia.com>
25  * 2015 Ken Fast <kfast@gdeb.com>
26  * 2017 Hasindu Gamaarachchi <hasindu@unsw.edu.au>
27  * 2018 Jeff Daily <jeff.daily@amd.com>
28  */
29 
30 #if !defined(SIMDE__SSE2_H)
31 #if !defined(SIMDE__SSE2_H)
32 #define SIMDE__SSE2_H
33 #endif
34 #include "sse.h"
35 
36 #if defined(SIMDE_SSE2_NATIVE)
37 #undef SIMDE_SSE2_NATIVE
38 #endif
39 #if defined(SIMDE_SSE2_FORCE_NATIVE)
40 #define SIMDE_SSE2_NATIVE
41 #elif defined(__SSE2__) && !defined(SIMDE_SSE2_NO_NATIVE) && \
42  !defined(SIMDE_NO_NATIVE)
43 #define SIMDE_SSE2_NATIVE
44 #elif defined(__ARM_NEON) && !defined(SIMDE_SSE2_NO_NEON) && \
45  !defined(SIMDE_NO_NEON)
46 #define SIMDE_SSE2_NEON
47 #endif
48 
49 #if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_SSE_NATIVE)
50 #if defined(SIMDE_SSE2_FORCE_NATIVE)
51 #error Native SSE2 support requires native SSE support
52 #else
53 #warning Native SSE2 support requires native SSE support, disabling
54 #undef SIMDE_SSE2_NATIVE
55 #endif
56 #elif defined(SIMDE_SSE2_NEON) && !defined(SIMDE_SSE_NEON)
57 #warning SSE2 NEON support requires SSE NEON support, disabling
58 #undef SIMDE_SSE2_NEON
59 #endif
60 
61 #if defined(SIMDE_SSE2_NATIVE)
62 #include <emmintrin.h>
63 #else
64 #if defined(SIMDE_SSE2_NEON)
65 #include <arm_neon.h>
66 #endif
67 #endif
68 
69 #include <stdint.h>
70 #include <limits.h>
71 #include <string.h>
72 
73 #define vreinterpretq_m128i_s32(v) \
74  (simde__m128i) { .neon_i32 = v }
75 #define vreinterpretq_m128i_u64(v) \
76  (simde__m128i) { .neon_u64 = v }
77 
78 #define vreinterpretq_s32_m128i(a) a.neon_i32
79 #define vreinterpretq_u64_m128i(a) a.neon_u64
80 
82 
83 typedef SIMDE_ALIGN(16) union {
84 #if defined(SIMDE__ENABLE_GCC_VEC_EXT)
85  int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
86  int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
87  int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
88  int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
89  uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
90  uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
91  uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
92  uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
93 #if defined(SIMDE__HAVE_INT128)
94  simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
95  simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
96 #endif
97  simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
98  simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
99 #else
100  int8_t i8[16];
101  int16_t i16[8];
102  int32_t i32[4];
103  int64_t i64[2];
104  uint8_t u8[16];
105  uint16_t u16[8];
106  uint32_t u32[4];
107  uint64_t u64[2];
108 #if defined(SIMDE__HAVE_INT128)
109  simde_int128 i128[1];
110  simde_uint128 u128[1];
111 #endif
112  simde_float32 f32[4];
113  simde_float64 f64[2];
114 #endif
115 
116 #if defined(SIMDE_SSE2_NATIVE)
117  __m128i n;
118 #elif defined(SIMDE_SSE2_NEON)
119  int8x16_t neon_i8;
120  int16x8_t neon_i16;
121  int32x4_t neon_i32;
122  int64x2_t neon_i64;
123  uint8x16_t neon_u8;
124  uint16x8_t neon_u16;
125  uint32x4_t neon_u32;
126  uint64x2_t neon_u64;
127  float32x4_t neon_f32;
128 #if defined(SIMDE_ARCH_AARCH64)
129  float64x2_t neon_f64;
130 #endif
131 #endif
132 } simde__m128i;
133 
134 typedef SIMDE_ALIGN(16) union {
135 #if defined(SIMDE__ENABLE_GCC_VEC_EXT)
136  int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
137  int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
138  int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
139  int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
140  uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
141  uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
142  uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
143  uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
144  simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
145  simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
146 #else
147  int8_t i8[16];
148  int16_t i16[8];
149  int32_t i32[4];
150  int64_t i64[2];
151  uint8_t u8[16];
152  uint16_t u16[8];
153  uint32_t u32[4];
154  uint64_t u64[2];
155  simde_float32 f32[4];
156  simde_float64 f64[2];
157 #endif
158 
159 #if defined(SIMDE_SSE2_NATIVE)
160  __m128d n;
161 #elif defined(SIMDE_SSE2_NEON)
162  int8x16_t neon_i8;
163  int16x8_t neon_i16;
164  int32x4_t neon_i32;
165  int64x2_t neon_i64;
166  uint8x16_t neon_u8;
167  uint16x8_t neon_u16;
168  uint32x4_t neon_u32;
169  uint64x2_t neon_u64;
170  float32x4_t neon_f32;
171 #if defined(SIMDE_ARCH_AARCH64)
172  float64x2_t neon_f64;
173 #endif
174 #endif
175 } simde__m128d;
176 
177 #if defined(SIMDE_SSE2_NATIVE)
178 HEDLEY_STATIC_ASSERT(sizeof(__m128i) == sizeof(simde__m128i),
179  "__m128i size doesn't match simde__m128i size");
180 HEDLEY_STATIC_ASSERT(sizeof(__m128d) == sizeof(simde__m128d),
181  "__m128d size doesn't match simde__m128d size");
182 SIMDE__FUNCTION_ATTRIBUTES simde__m128i SIMDE__M128I_C(__m128i v)
183 {
184  simde__m128i r;
185  r.n = v;
186  return r;
187 }
188 SIMDE__FUNCTION_ATTRIBUTES simde__m128d SIMDE__M128D_C(__m128d v)
189 {
190  simde__m128d r;
191  r.n = v;
192  return r;
193 }
194 #elif defined(SIMDE_SSE_NEON)
195 #define SIMDE__M128I_NEON_C(T, expr) \
196  (simde__m128i) { .neon_##T = expr }
197 #define SIMDE__M128D_NEON_C(T, expr) \
198  (simde__m128d) { .neon_##T = expr }
199 #endif
200 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128i), "simde__m128i size incorrect");
201 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128d), "simde__m128d size incorrect");
202 
205 {
206 #if defined(SIMDE_SSE2_NATIVE)
207  return SIMDE__M128I_C(_mm_add_epi8(a.n, b.n));
208 #elif defined(SIMDE_SSE2_NEON)
209  return SIMDE__M128I_NEON_C(i8, vaddq_s8(a.neon_i8, b.neon_i8));
210 #else
211  simde__m128i r;
213  for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
214  r.i8[i] = a.i8[i] + b.i8[i];
215  }
216  return r;
217 #endif
218 }
219 
222 {
223 #if defined(SIMDE_SSE2_NATIVE)
224  return SIMDE__M128I_C(_mm_add_epi16(a.n, b.n));
225 #elif defined(SIMDE_SSE2_NEON)
226  return SIMDE__M128I_NEON_C(i16, vaddq_s16(a.neon_i16, b.neon_i16));
227 #else
228  simde__m128i r;
230  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
231  r.i16[i] = a.i16[i] + b.i16[i];
232  }
233  return r;
234 #endif
235 }
236 
239 {
240 #if defined(SIMDE_SSE2_NATIVE)
241  return SIMDE__M128I_C(_mm_add_epi32(a.n, b.n));
242 #elif defined(SIMDE_SSE2_NEON)
243  return SIMDE__M128I_NEON_C(i32, vaddq_s32(a.neon_i32, b.neon_i32));
244 #else
245  simde__m128i r;
247  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
248  r.i32[i] = a.i32[i] + b.i32[i];
249  }
250  return r;
251 #endif
252 }
253 
256 {
257 #if defined(SIMDE_SSE2_NATIVE)
258  return SIMDE__M128I_C(_mm_add_epi64(a.n, b.n));
259 #elif defined(SIMDE_SSE2_NEON)
260  return SIMDE__M128I_NEON_C(i64, vaddq_s64(a.neon_i64, b.neon_i64));
261 #else
262  simde__m128i r;
264  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
265  r.i64[i] = a.i64[i] + b.i64[i];
266  }
267  return r;
268 #endif
269 }
270 
273 {
274 #if defined(SIMDE_SSE2_NATIVE)
275  return SIMDE__M128D_C(_mm_add_pd(a.n, b.n));
276 #elif defined(SIMDE_SSE2_NEON) && defined(SIMDE_ARCH_AARCH64)
277  return SIMDE__M128D_NEON_C(f64, vaddq_f64(a.neon_f64, b.neon_f64));
278 #else
279  simde__m128d r;
281  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
282  r.f64[i] = a.f64[i] + b.f64[i];
283  }
284  return r;
285 #endif
286 }
287 
290 {
291 #if defined(SIMDE_SSE2_NATIVE)
292  return SIMDE__M128D_C(_mm_add_sd(a.n, b.n));
293 #else
294  simde__m128d r;
295  r.f64[0] = a.f64[0] + b.f64[0];
296  r.f64[1] = a.f64[1];
297  return r;
298 #endif
299 }
300 
303 {
304 #if defined(SIMDE_SSE2_NATIVE)
305  return SIMDE__M64_C(_mm_add_si64(a.n, b.n));
306 #elif defined(SIMDE_SSE2_NEON)
307  return SIMDE__M64_NEON_C(i64, vadd_s64(a.neon_i64, b.neon_i64));
308 #else
309  simde__m64 r;
310  r.i64[0] = a.i64[0] + b.i64[0];
311  return r;
312 #endif
313 }
314 
317 {
318 #if defined(SIMDE_SSE2_NATIVE)
319  return SIMDE__M128I_C(_mm_adds_epi8(a.n, b.n));
320 #elif defined(SIMDE_SSE2_NEON)
321  return SIMDE__M128I_NEON_C(i8, vqaddq_s8(a.neon_i8, b.neon_i8));
322 #else
323  simde__m128i r;
325  for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
326  if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
327  r.i8[i] = INT8_MAX;
328  } else if ((((b.i8[i]) < 0) &&
329  ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
330  r.i8[i] = INT8_MIN;
331  } else {
332  r.i8[i] = (a.i8[i]) + (b.i8[i]);
333  }
334  }
335  return r;
336 #endif
337 }
338 
341 {
342 #if defined(SIMDE_SSE2_NATIVE)
343  return SIMDE__M128I_C(_mm_adds_epi16(a.n, b.n));
344 #elif defined(SIMDE_SSE2_NEON)
345  return SIMDE__M128I_NEON_C(i16, vqaddq_s16(a.neon_i16, b.neon_i16));
346 #else
347  simde__m128i r;
349  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
350  if ((((b.i16[i]) > 0) &&
351  ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
352  r.i16[i] = INT16_MAX;
353  } else if ((((b.i16[i]) < 0) &&
354  ((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) {
355  r.i16[i] = INT16_MIN;
356  } else {
357  r.i16[i] = (a.i16[i]) + (b.i16[i]);
358  }
359  }
360  return r;
361 #endif
362 }
363 
366 {
367 #if defined(SIMDE_SSE2_NATIVE)
368  return SIMDE__M128I_C(_mm_adds_epu8(a.n, b.n));
369 #elif defined(SIMDE_SSE2_NEON)
370  return SIMDE__M128I_NEON_C(u8, vqaddq_u8(a.neon_u8, b.neon_u8));
371 #else
372  simde__m128i r;
374  for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
375  r.u8[i] = ((UINT8_MAX - a.u8[i]) > b.u8[i])
376  ? (a.u8[i] + b.u8[i])
377  : UINT8_MAX;
378  }
379  return r;
380 #endif
381 }
382 
385 {
386 #if defined(SIMDE_SSE2_NATIVE)
387  return SIMDE__M128I_C(_mm_adds_epu16(a.n, b.n));
388 #elif defined(SIMDE_SSE2_NEON)
389  return SIMDE__M128I_NEON_C(u16, vqaddq_u16(a.neon_u16, b.neon_u16));
390 #else
391  simde__m128i r;
393  for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
394  r.u16[i] = ((UINT16_MAX - a.u16[i]) > b.u16[i])
395  ? (a.u16[i] + b.u16[i])
396  : UINT16_MAX;
397  }
398  return r;
399 #endif
400 }
401 
404 {
405 #if defined(SIMDE_SSE2_NATIVE)
406  return SIMDE__M128D_C(_mm_and_pd(a.n, b.n));
407 #elif defined(SIMDE_SSE2_NEON)
408  return SIMDE__M128D_NEON_C(i32, vandq_s32(a.neon_i32, b.neon_i32));
409 #else
410  simde__m128d r;
412  for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
413  r.u64[i] = a.u64[i] & b.u64[i];
414  }
415  return r;
416 #endif
417 }
418 
421 {
422 #if defined(SIMDE_SSE2_NATIVE)
423  return SIMDE__M128I_C(_mm_and_si128(a.n, b.n));
424 #elif defined(SIMDE_SSE2_NEON)
425  return SIMDE__M128I_NEON_C(i32, vandq_s32(b.neon_i32, a.neon_i32));
426 #else
427  simde__m128i r;
429  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
430  r.i64[i] = a.i64[i] & b.i64[i];
431  }
432  return r;
433 #endif
434 }
435 
438 {
439 #if defined(SIMDE_SSE2_NATIVE)
440  return SIMDE__M128D_C(_mm_andnot_pd(a.n, b.n));
441 #elif defined(SIMDE_SSE2_NEON)
442  return SIMDE__M128D_NEON_C(i32, vbicq_s32(a.neon_i32, b.neon_i32));
443 #else
444  simde__m128d r;
446  for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
447  r.u64[i] = ~a.u64[i] & b.u64[i];
448  }
449  return r;
450 #endif
451 }
452 
455 {
456 #if defined(SIMDE_SSE2_NATIVE)
457  return SIMDE__M128I_C(_mm_andnot_si128(a.n, b.n));
458 #elif defined(SIMDE_SSE2_NEON)
459  return SIMDE__M128I_NEON_C(i32, vbicq_s32(b.neon_i32, a.neon_i32));
460 #else
461  simde__m128i r;
463  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
464  r.i64[i] = ~(a.i64[i]) & b.i64[i];
465  }
466  return r;
467 #endif
468 }
469 
472 {
473 #if defined(SIMDE_SSE2_NATIVE)
474  return SIMDE__M128I_C(_mm_avg_epu8(a.n, b.n));
475 #elif defined(SIMDE_SSE2_NEON)
476  return SIMDE__M128I_NEON_C(u8, vrhaddq_u8(b.neon_u8, a.neon_u8));
477 #else
478  simde__m128i r;
480  for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
481  r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1;
482  }
483  return r;
484 #endif
485 }
486 
489 {
490 #if defined(SIMDE_SSE2_NATIVE)
491  return SIMDE__M128I_C(_mm_avg_epu16(a.n, b.n));
492 #elif defined(SIMDE_SSE2_NEON)
493  return SIMDE__M128I_NEON_C(u16, vrhaddq_u16(b.neon_u16, a.neon_u16));
494 #else
495  simde__m128i r;
497  for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
498  r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1;
499  }
500  return r;
501 #endif
502 }
503 
506 {
507  simde__m128i r;
508 
509  if (HEDLEY_UNLIKELY(imm8 > 15)) {
510  r.u64[0] = 0;
511  r.u64[1] = 0;
512  return r;
513  }
514 
515  const int s = imm8 * 8;
516 
517 #if defined(SIMDE__HAVE_INT128)
518  r.u128[0] = a.u128[0] << s;
519 #else
520  if (s < 64) {
521  r.u64[0] = (a.u64[0] << s);
522  r.u64[1] = (a.u64[1] << s) | (a.u64[0] >> (64 - s));
523  } else {
524  r.u64[0] = 0;
525  r.u64[1] = a.u64[0] << (s - 64);
526  }
527 #endif
528 
529  return r;
530 }
531 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
532 #define simde_mm_bslli_si128(a, imm8) SIMDE__M128I_C(_mm_slli_si128(a.n, imm8))
533 #elif defined(SIMDE_SSE2_NEON)
534 #define simde_mm_bslli_si128(a, imm8) \
535  SIMDE__M128I_NEON_C( \
536  i8, \
537  (((imm8) <= 0) ? ((a).neon_i8) \
538  : (((imm8) > 15) ? (vdupq_n_s8(0)) \
539  : (vextq_s8(vdupq_n_s8(0), \
540  (a).neon_i8, \
541  16 - (imm8))))))
542 #endif
543 #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8)
544 
547 {
548  simde__m128i r;
549 
550  if (HEDLEY_UNLIKELY(imm8 > 15)) {
551  r.u64[0] = 0;
552  r.u64[1] = 0;
553  return r;
554  }
555 
556  const int s = imm8 * 8;
557 
558 #if defined(SIMDE__HAVE_INT128)
559  r.u128[0] = a.u128[0] >> s;
560 #else
561  if (s < 64) {
562  r.u64[0] = (a.u64[0] >> s) | (a.u64[1] << (64 - s));
563  r.u64[1] = (a.u64[1] >> s);
564  } else {
565  r.u64[0] = a.u64[1] >> (s - 64);
566  r.u64[1] = 0;
567  }
568 #endif
569 
570  return r;
571 }
572 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
573 #define simde_mm_bsrli_si128(a, imm8) SIMDE__M128I_C(_mm_srli_si128(a.n, imm8))
574 #elif defined(SIMDE_SSE2_NEON)
575 #define simde_mm_bsrli_si128(a, imm8) \
576  SIMDE__M128I_NEON_C( \
577  i8, \
578  ((imm8) <= 0) \
579  ? ((a).neon_i8) \
580  : (((imm8) > 15) ? (vdupq_n_s8(0)) \
581  : (vextq_s8((a).neon_i8, \
582  vdupq_n_s8(0), (imm8)))))
583 #endif
584 #define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128(a, imm8)
585 
587 void simde_mm_clflush(void const *p)
588 {
589 #if defined(SIMDE_SSE2_NATIVE)
590  _mm_clflush(p);
591 #else
592  (void)p;
593 #endif
594 }
595 
598 {
599 #if defined(SIMDE_SSE2_NATIVE)
600  return _mm_comieq_sd(a.n, b.n);
601 #else
602  return a.f64[0] == b.f64[0];
603 #endif
604 }
605 
608 {
609 #if defined(SIMDE_SSE2_NATIVE)
610  return _mm_comige_sd(a.n, b.n);
611 #else
612  return a.f64[0] >= b.f64[0];
613 #endif
614 }
615 
618 {
619 #if defined(SIMDE_SSE2_NATIVE)
620  return _mm_comigt_sd(a.n, b.n);
621 #else
622  return a.f64[0] > b.f64[0];
623 #endif
624 }
625 
628 {
629 #if defined(SIMDE_SSE2_NATIVE)
630  return _mm_comile_sd(a.n, b.n);
631 #else
632  return a.f64[0] <= b.f64[0];
633 #endif
634 }
635 
638 {
639 #if defined(SIMDE_SSE2_NATIVE)
640  return _mm_comilt_sd(a.n, b.n);
641 #else
642  return a.f64[0] < b.f64[0];
643 #endif
644 }
645 
648 {
649 #if defined(SIMDE_SSE2_NATIVE)
650  return _mm_comineq_sd(a.n, b.n);
651 #else
652  return a.f64[0] != b.f64[0];
653 #endif
654 }
655 
658 {
659 #if defined(SIMDE_SSE2_NATIVE)
660  return SIMDE__M128_C(_mm_castpd_ps(a.n));
661 #else
662  union {
663  simde__m128d pd;
664  simde__m128 ps;
665  } r;
666  r.pd = a;
667  return r.ps;
668 #endif
669 }
670 
673 {
674 #if defined(SIMDE_SSE2_NATIVE)
675  return SIMDE__M128I_C(_mm_castpd_si128(a.n));
676 #else
677  union {
678  simde__m128d pd;
679  simde__m128i si128;
680  } r;
681  r.pd = a;
682  return r.si128;
683 #endif
684 }
685 
688 {
689 #if defined(SIMDE_SSE2_NATIVE)
690  return SIMDE__M128D_C(_mm_castps_pd(a.n));
691 #else
692  union {
693  simde__m128 ps;
694  simde__m128d pd;
695  } r;
696  r.ps = a;
697  return r.pd;
698 #endif
699 }
700 
703 {
704 #if defined(SIMDE_SSE2_NATIVE)
705  return SIMDE__M128I_C(_mm_castps_si128(a.n));
706 #elif defined(SIMDE_SSE2_NEON)
707  return SIMDE__M128I_NEON_C(i32, a.neon_i32);
708 #else
709  union {
710  simde__m128 ps;
711  simde__m128i si128;
712  } r;
713  r.ps = a;
714  return r.si128;
715 #endif
716 }
717 
720 {
721 #if defined(SIMDE_SSE2_NATIVE)
722  return SIMDE__M128D_C(_mm_castsi128_pd(a.n));
723 #else
724  union {
725  simde__m128i si128;
726  simde__m128d pd;
727  } r;
728  r.si128 = a;
729  return r.pd;
730 #endif
731 }
732 
735 {
736 #if defined(SIMDE_SSE2_NATIVE)
737  return SIMDE__M128_C(_mm_castsi128_ps(a.n));
738 #elif defined(SIMDE_SSE2_NEON)
739  return SIMDE__M128_NEON_C(f32, a.neon_f32);
740 #else
741  union {
742  simde__m128i si128;
743  simde__m128 ps;
744  } r;
745  r.si128 = a;
746  return r.ps;
747 #endif
748 }
749 
752 {
753 #if defined(SIMDE_SSE2_NATIVE)
754  return SIMDE__M128I_C(_mm_cmpeq_epi8(a.n, b.n));
755 #elif defined(SIMDE_SSE2_NEON)
756  return SIMDE__M128I_NEON_C(
757  i8, vreinterpretq_s8_u8(vceqq_s8(b.neon_i8, a.neon_i8)));
758 #else
759  simde__m128i r;
761  for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
762  r.i8[i] = (a.i8[i] == b.i8[i]) ? 0xff : 0x00;
763  }
764  return r;
765 #endif
766 }
767 
770 {
771 #if defined(SIMDE_SSE2_NATIVE)
772  return SIMDE__M128I_C(_mm_cmpeq_epi16(a.n, b.n));
773 #elif defined(SIMDE_SSE2_NEON)
774  return SIMDE__M128I_NEON_C(
775  i16, vreinterpretq_s16_u16(vceqq_s16(b.neon_i16, a.neon_i16)));
776 #else
777  simde__m128i r;
779  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
780  r.i16[i] = (a.i16[i] == b.i16[i]) ? 0xffff : 0x0000;
781  }
782  return r;
783 #endif
784 }
785 
788 {
789 #if defined(SIMDE_SSE2_NATIVE)
790  return SIMDE__M128I_C(_mm_cmpeq_epi32(a.n, b.n));
791 #elif defined(SIMDE_SSE2_NEON)
792  return SIMDE__M128I_NEON_C(
793  i32, vreinterpretq_s32_u32(vceqq_s32(b.neon_i32, a.neon_i32)));
794 #else
795  simde__m128i r;
797  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
798  r.i32[i] = (a.i32[i] == b.i32[i]) ? 0xffffffff : 0x00000000;
799  }
800  return r;
801 #endif
802 }
803 
806 {
807 #if defined(SIMDE_SSE2_NATIVE)
808  return SIMDE__M128D_C(_mm_cmpeq_pd(a.n, b.n));
809 #elif defined(SIMDE_SSE2_NEON)
810  return SIMDE__M128D_NEON_C(
811  i32, vreinterpretq_s32_u32(
812  vceqq_s32(vreinterpretq_s32_f32(b.neon_f32),
813  vreinterpretq_s32_f32(a.neon_f32))));
814 #else
815  simde__m128d r;
817  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
818  r.u64[i] = (a.f64[i] == b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
819  }
820  return r;
821 #endif
822 }
823 
826 {
827 #if defined(SIMDE_SSE2_NATIVE)
828  return SIMDE__M128D_C(_mm_cmpeq_sd(a.n, b.n));
829 #else
830  simde__m128d r;
831  r.u64[0] = (a.f64[0] == b.f64[0]) ? ~UINT64_C(0) : 0;
832  r.u64[1] = a.u64[1];
833  return r;
834 #endif
835 }
836 
839 {
840 #if defined(SIMDE_SSE2_NATIVE)
841  return SIMDE__M128D_C(_mm_cmpneq_pd(a.n, b.n));
842 #elif defined(SIMDE_SSE2_NEON)
843  return SIMDE__M128D_NEON_C(f32,
844  vreinterpretq_f32_u16(vmvnq_u16(
845  vceqq_s16(b.neon_i16, a.neon_i16))));
846 #else
847  simde__m128d r;
849  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
850  r.u64[i] = (a.f64[i] != b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
851  }
852  return r;
853 #endif
854 }
855 
858 {
859 #if defined(SIMDE_SSE2_NATIVE)
860  return SIMDE__M128D_C(_mm_cmpneq_sd(a.n, b.n));
861 #else
862  simde__m128d r;
863  r.u64[0] = (a.f64[0] != b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
864  r.u64[1] = a.u64[1];
865  return r;
866 #endif
867 }
868 
871 {
872 #if defined(SIMDE_SSE2_NATIVE)
873  return SIMDE__M128I_C(_mm_cmplt_epi8(a.n, b.n));
874 #elif defined(SIMDE_SSE2_NEON)
875  return SIMDE__M128I_NEON_C(
876  i8, vreinterpretq_s8_u8(vcltq_s8(a.neon_i8, b.neon_i8)));
877 #else
878  simde__m128i r;
880  for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
881  r.i8[i] = (a.i8[i] < b.i8[i]) ? 0xff : 0x00;
882  }
883  return r;
884 #endif
885 }
886 
889 {
890 #if defined(SIMDE_SSE2_NATIVE)
891  return SIMDE__M128I_C(_mm_cmplt_epi16(a.n, b.n));
892 #elif defined(SIMDE_SSE2_NEON)
893  return SIMDE__M128I_NEON_C(
894  i16, vreinterpretq_s16_u16(vcltq_s16(a.neon_i16, b.neon_i16)));
895 #else
896  simde__m128i r;
898  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
899  r.i16[i] = (a.i16[i] < b.i16[i]) ? 0xffff : 0x0000;
900  }
901  return r;
902 #endif
903 }
904 
907 {
908 #if defined(SIMDE_SSE2_NATIVE)
909  return SIMDE__M128I_C(_mm_cmplt_epi32(a.n, b.n));
910 #elif defined(SIMDE_SSE2_NEON)
911  return SIMDE__M128I_NEON_C(
912  i32, vreinterpretq_s32_u32(vcltq_s32(a.neon_i32, b.neon_i32)));
913 #else
914  simde__m128i r;
916  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
917  r.i32[i] = (a.i32[i] < b.i32[i]) ? 0xffffffff : 0x00000000;
918  }
919  return r;
920 #endif
921 }
922 
925 {
926 #if defined(SIMDE_SSE2_NATIVE)
927  return SIMDE__M128D_C(_mm_cmplt_pd(a.n, b.n));
928 #else
929  simde__m128d r;
931  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
932  r.u64[i] = (a.f64[i] < b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
933  }
934  return r;
935 #endif
936 }
937 
940 {
941 #if defined(SIMDE_SSE2_NATIVE)
942  return SIMDE__M128D_C(_mm_cmplt_sd(a.n, b.n));
943 #else
944  simde__m128d r;
945  r.u64[0] = (a.f64[0] < b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
946  r.u64[1] = a.u64[1];
947  return r;
948 #endif
949 }
950 
953 {
954 #if defined(SIMDE_SSE2_NATIVE)
955  return SIMDE__M128D_C(_mm_cmple_pd(a.n, b.n));
956 #else
957  simde__m128d r;
959  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
960  r.u64[i] = (a.f64[i] <= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
961  }
962  return r;
963 #endif
964 }
965 
968 {
969 #if defined(SIMDE_SSE2_NATIVE)
970  return SIMDE__M128D_C(_mm_cmple_sd(a.n, b.n));
971 #else
972  simde__m128d r;
973  r.u64[0] = (a.f64[0] <= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
974  r.u64[1] = a.u64[1];
975  return r;
976 #endif
977 }
978 
981 {
982 #if defined(SIMDE_SSE2_NATIVE)
983  return SIMDE__M128I_C(_mm_cmpgt_epi8(a.n, b.n));
984 #elif defined(SIMDE_SSE2_NEON)
985  return SIMDE__M128I_NEON_C(
986  i8, vreinterpretq_s8_u8(vcgtq_s8(a.neon_i8, b.neon_i8)));
987 #else
988  simde__m128i r;
990  for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
991  r.i8[i] = (a.i8[i] > b.i8[i]) ? 0xff : 0x00;
992  }
993  return r;
994 #endif
995 }
996 
999 {
1000 #if defined(SIMDE_SSE2_NATIVE)
1001  return SIMDE__M128I_C(_mm_cmpgt_epi16(a.n, b.n));
1002 #elif defined(SIMDE_SSE2_NEON)
1003  return SIMDE__M128I_NEON_C(
1004  i16, vreinterpretq_s16_u16(vcgtq_s16(a.neon_i16, b.neon_i16)));
1005 #else
1006  simde__m128i r;
1008  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
1009  r.i16[i] = (a.i16[i] > b.i16[i]) ? 0xffff : 0x0000;
1010  }
1011  return r;
1012 #endif
1013 }
1014 
1017 {
1018 #if defined(SIMDE_SSE2_NATIVE)
1019  return SIMDE__M128I_C(_mm_cmpgt_epi32(a.n, b.n));
1020 #elif defined(SIMDE_SSE2_NEON)
1021  return SIMDE__M128I_NEON_C(
1022  i32, vreinterpretq_s32_u32(vcgtq_s32(a.neon_i32, b.neon_i32)));
1023 #else
1024  simde__m128i r;
1026  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1027  r.i32[i] = (a.i32[i] > b.i32[i]) ? 0xffffffff : 0x00000000;
1028  }
1029  return r;
1030 #endif
1031 }
1032 
1035 {
1036 #if defined(SIMDE_SSE2_NATIVE)
1037  return SIMDE__M128D_C(_mm_cmpgt_pd(a.n, b.n));
1038 #else
1039  simde__m128d r;
1041  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1042  r.u64[i] = (a.f64[i] > b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1043  }
1044  return r;
1045 #endif
1046 }
1047 
1050 {
1051 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
1052  return SIMDE__M128D_C(_mm_cmpgt_sd(a.n, b.n));
1053 #else
1054  simde__m128d r;
1055  r.u64[0] = (a.f64[0] > b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1056  r.u64[1] = a.u64[1];
1057  return r;
1058 #endif
1059 }
1060 
1063 {
1064 #if defined(SIMDE_SSE2_NATIVE)
1065  return SIMDE__M128D_C(_mm_cmpge_pd(a.n, b.n));
1066 #else
1067  simde__m128d r;
1069  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1070  r.u64[i] = (a.f64[i] >= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1071  }
1072  return r;
1073 #endif
1074 }
1075 
1078 {
1079 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
1080  return SIMDE__M128D_C(_mm_cmpge_sd(a.n, b.n));
1081 #else
1082  simde__m128d r;
1083  r.u64[0] = (a.f64[0] >= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1084  r.u64[1] = a.u64[1];
1085  return r;
1086 #endif
1087 }
1088 
1091 {
1092 #if defined(SIMDE_SSE2_NATIVE)
1093  return SIMDE__M128D_C(_mm_cmpnge_pd(a.n, b.n));
1094 #else
1095  return simde_mm_cmplt_pd(a, b);
1096 #endif
1097 }
1098 
1101 {
1102 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
1103  return SIMDE__M128D_C(_mm_cmpnge_sd(a.n, b.n));
1104 #else
1105  return simde_mm_cmplt_sd(a, b);
1106 #endif
1107 }
1108 
1111 {
1112 #if defined(SIMDE_SSE2_NATIVE)
1113  return SIMDE__M128D_C(_mm_cmpnlt_pd(a.n, b.n));
1114 #else
1115  return simde_mm_cmpge_pd(a, b);
1116 #endif
1117 }
1118 
1121 {
1122 #if defined(SIMDE_SSE2_NATIVE)
1123  return SIMDE__M128D_C(_mm_cmpnlt_sd(a.n, b.n));
1124 #else
1125  return simde_mm_cmpge_sd(a, b);
1126 #endif
1127 }
1128 
1131 {
1132 #if defined(SIMDE_SSE2_NATIVE)
1133  return SIMDE__M128D_C(_mm_cmpnle_pd(a.n, b.n));
1134 #else
1135  return simde_mm_cmpgt_pd(a, b);
1136 #endif
1137 }
1138 
1141 {
1142 #if defined(SIMDE_SSE2_NATIVE)
1143  return SIMDE__M128D_C(_mm_cmpnle_sd(a.n, b.n));
1144 #else
1145  return simde_mm_cmpgt_sd(a, b);
1146 #endif
1147 }
1148 
1151 {
1152 #if defined(SIMDE_SSE2_NATIVE)
1153  return SIMDE__M128D_C(_mm_cmpord_pd(a.n, b.n));
1154 #else
1155  simde__m128d r;
1157  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1158  r.u64[i] = (!isnan(a.f64[i]) && !isnan(b.f64[i])) ? ~UINT64_C(0)
1159  : UINT64_C(0);
1160  }
1161  return r;
1162 #endif
1163 }
1164 
1167 {
1168 #if defined(SIMDE_SSE2_NATIVE)
1169  return SIMDE__M128D_C(_mm_cmpord_sd(a.n, b.n));
1170 #else
1171  simde__m128d r;
1172  r.u64[0] = (!isnan(a.f64[0]) && !isnan(b.f64[0])) ? ~UINT64_C(0)
1173  : UINT64_C(0);
1174  r.u64[1] = a.u64[1];
1175  return r;
1176 #endif
1177 }
1178 
1181 {
1182 #if defined(SIMDE_SSE2_NATIVE)
1183  return SIMDE__M128D_C(_mm_cmpunord_pd(a.n, b.n));
1184 #else
1185  simde__m128d r;
1187  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1188  r.u64[i] = (isnan(a.f64[i]) || isnan(b.f64[i])) ? ~UINT64_C(0)
1189  : UINT64_C(0);
1190  }
1191  return r;
1192 #endif
1193 }
1194 
1197 {
1198 #if defined(SIMDE_SSE2_NATIVE)
1199  return SIMDE__M128D_C(_mm_cmpunord_sd(a.n, b.n));
1200 #else
1201  simde__m128d r;
1202  r.u64[0] = (isnan(a.f64[0]) || isnan(b.f64[0])) ? ~UINT64_C(0)
1203  : UINT64_C(0);
1204  r.u64[1] = a.u64[1];
1205  return r;
1206 #endif
1207 }
1208 
1211 {
1212 #if defined(SIMDE_SSE2_NATIVE)
1213  return SIMDE__M128D_C(_mm_cvtepi32_pd(a.n));
1214 #else
1215  simde__m128d r;
1217  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1218  r.f64[i] = (simde_float64)a.i32[i];
1219  }
1220  return r;
1221 #endif
1222 }
1223 
1226 {
1227 #if defined(SIMDE_SSE2_NATIVE)
1228  return SIMDE__M128_C(_mm_cvtepi32_ps(a.n));
1229 #elif defined(SIMDE_SSE2_NEON)
1230  return SIMDE__M128_NEON_C(f32, vcvtq_f32_s32(a.neon_i32));
1231 #else
1232  simde__m128 r;
1234  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1235  r.f32[i] = (simde_float32)a.i32[i];
1236  }
1237  return r;
1238 #endif
1239 }
1240 
1243 {
1244 #if defined(SIMDE_SSE2_NATIVE)
1245  return SIMDE__M128I_C(_mm_cvtpd_epi32(a.n));
1246 #else
1247  simde__m128i r;
1249  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1250  r.i32[i] = (int32_t)a.f64[i];
1251  }
1252  return r;
1253 #endif
1254 }
1255 
1258 {
1259 #if defined(SIMDE_SSE2_NATIVE)
1260  return SIMDE__M64_C(_mm_cvtpd_pi32(a.n));
1261 #else
1262  simde__m64 r;
1264  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1265  r.i32[i] = (int32_t)a.f64[i];
1266  }
1267  return r;
1268 #endif
1269 }
1270 
1273 {
1274 #if defined(SIMDE_SSE2_NATIVE)
1275  return SIMDE__M128_C(_mm_cvtpd_ps(a.n));
1276 #else
1277  simde__m128 r;
1279  for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
1280  r.f32[i] = (simde_float32)a.f64[i];
1281  }
1282  return r;
1283 #endif
1284 }
1285 
1288 {
1289 #if defined(SIMDE_SSE2_NATIVE)
1290  return SIMDE__M128D_C(_mm_cvtpi32_pd(a.n));
1291 #else
1292  simde__m128d r;
1294  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1295  r.f64[i] = (simde_float64)a.i32[i];
1296  }
1297  return r;
1298 #endif
1299 }
1300 
1303 {
1304 #if defined(SIMDE_SSE2_NATIVE)
1305  return SIMDE__M128I_C(_mm_cvtps_epi32(a.n));
1306 #elif defined(SIMDE_SSE2_NEON)
1307 /* The default rounding mode on SSE is 'round to even', which ARMv7
1308  does not support; it is supported on ARMv8, however. */
1309 #if defined(SIMDE_ARCH_AARCH64)
1310  return SIMDE__M128I_NEON_C(i32, vcvtnq_s32_f32(a.neon_f32));
1311 #else
1312  uint32x4_t signmask = vdupq_n_u32(0x80000000);
1313  float32x4_t half = vbslq_f32(signmask, a.neon_f32,
1314  vdupq_n_f32(0.5f)); /* +/- 0.5 */
1315  int32x4_t r_normal = vcvtq_s32_f32(
1316  vaddq_f32(a.neon_f32, half)); /* round to integer: [a + 0.5]*/
1317  int32x4_t r_trunc =
1318  vcvtq_s32_f32(a.neon_f32); /* truncate to integer: [a] */
1319  int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31); /* 1 or 0 */
1320  int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
1321  vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
1322  float32x4_t delta = vsubq_f32(
1323  a.neon_f32,
1324  vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
1325  uint32x4_t is_delta_half =
1326  vceqq_f32(delta, half); /* delta == +/- 0.5 */
1327  return SIMDE__M128I_NEON_C(i32,
1328  vbslq_s32(is_delta_half, r_even, r_normal));
1329 #endif
1330 #else
1331  simde__m128i r;
1333  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1334  r.i32[i] = (int32_t)a.f32[i];
1335  }
1336  return r;
1337 #endif
1338 }
1339 
1342 {
1343 #if defined(SIMDE_SSE2_NATIVE)
1344  return SIMDE__M128D_C(_mm_cvtps_pd(a.n));
1345 #else
1346  simde__m128d r;
1348  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1349  r.f64[i] = a.f32[i];
1350  }
1351  return r;
1352 #endif
1353 }
1354 
1357 {
1358 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
1359  return _mm_cvtsd_f64(a.n);
1360 #else
1361  return a.f64[0];
1362 #endif
1363 }
1364 
1367 {
1368 #if defined(SIMDE_SSE2_NATIVE)
1369  return _mm_cvtsd_si32(a.n);
1370 #else
1371  return (int32_t)a.f64[0];
1372 #endif
1373 }
1374 
1377 {
1378 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
1379 #if defined(__PGI)
1380  return _mm_cvtsd_si64x(a.n);
1381 #else
1382  return _mm_cvtsd_si64(a.n);
1383 #endif
1384 #else
1385  return (int64_t)a.f64[0];
1386 #endif
1387 }
1388 #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a)
1389 
1392 {
1393 #if defined(SIMDE_SSE2_NATIVE)
1394  return SIMDE__M128_C(_mm_cvtsd_ss(a.n, b.n));
1395 #else
1396  simde__m128 r;
1397 
1398  r.f32[0] = (simde_float32)b.f64[0];
1399 
1401  for (size_t i = 1; i < (sizeof(r) / sizeof(r.i32[0])); i++) {
1402  r.i32[i] = a.i32[i];
1403  }
1404 
1405  return r;
1406 #endif
1407 }
1408 
1411 {
1412 #if defined(SIMDE_SSE2_NATIVE)
1413  return _mm_cvtsi128_si32(a.n);
1414 #elif defined(SIMDE_SSE2_NEON)
1415  return vgetq_lane_s32(a.neon_i32, 0);
1416 #else
1417  return a.i32[0];
1418 #endif
1419 }
1420 
1423 {
1424 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
1425 #if defined(__PGI)
1426  return _mm_cvtsi128_si64x(a.n);
1427 #else
1428  return _mm_cvtsi128_si64(a.n);
1429 #endif
1430 #else
1431  return a.i64[0];
1432 #endif
1433 }
1434 #define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a)
1435 
1438 {
1439 #if defined(SIMDE_SSE2_NATIVE)
1440  return SIMDE__M128D_C(_mm_cvtsi32_sd(a.n, b));
1441 #else
1442  simde__m128d r;
1443 
1444  r.f64[0] = (simde_float64)b;
1445  r.i64[1] = a.i64[1];
1446 
1447  return r;
1448 #endif
1449 }
1450 
1453 {
1454  simde__m128i r;
1455 
1456 #if defined(SIMDE_SSE2_NATIVE)
1457  r.n = _mm_cvtsi32_si128(a);
1458 #elif defined(SIMDE_SSE2_NEON)
1459  r.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
1460 #else
1461  r.i32[0] = a;
1462  r.i32[1] = 0;
1463  r.i32[2] = 0;
1464  r.i32[3] = 0;
1465 #endif
1466 
1467  return r;
1468 }
1469 
1472 {
1473  simde__m128d r;
1474 
1475 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
1476 #if !defined(__PGI)
1477  r.n = _mm_cvtsi64_sd(a.n, b);
1478 #else
1479  r.n = _mm_cvtsi64x_sd(a.n, b);
1480 #endif
1481 #else
1482  r.f64[0] = (simde_float64)b;
1483  r.f64[1] = a.f64[1];
1484 #endif
1485 
1486  return r;
1487 }
1488 #define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64_sd(a, b)
1489 
1492 {
1493  simde__m128i r;
1494 
1495 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
1496 #if !defined(__PGI)
1497  r.n = _mm_cvtsi64_si128(a);
1498 #else
1499  r.n = _mm_cvtsi64x_si128(a);
1500 #endif
1501 #else
1502  r.i64[0] = a;
1503  r.i64[1] = 0;
1504 #endif
1505 
1506  return r;
1507 }
1508 #define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a)
1509 
1512 {
1513  simde__m128d r;
1514 
1515 #if defined(SIMDE_SSE2_NATIVE)
1516  r.n = _mm_cvtss_sd(a.n, b.n);
1517 #else
1518  r.f64[0] = b.f32[0];
1519  r.i64[1] = a.i64[1];
1520 #endif
1521 
1522  return r;
1523 }
1524 
1527 {
1528  simde__m128i r;
1529 
1530 #if defined(SIMDE_SSE2_NATIVE)
1531  r.n = _mm_cvttpd_epi32(a.n);
1532 #else
1533  for (size_t i = 0; i < (sizeof(a.f64) / sizeof(a.f64[0])); i++) {
1534  r.i32[i] = (int32_t)trunc(a.f64[i]);
1535  }
1536 #endif
1537 
1538  return r;
1539 }
1540 
1543 {
1544  simde__m64 r;
1545 
1546 #if defined(SIMDE_SSE2_NATIVE)
1547  r.n = _mm_cvttpd_pi32(a.n);
1548 #else
1549  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1550  r.i32[i] = (int32_t)trunc(a.f64[i]);
1551  }
1552 #endif
1553 
1554  return r;
1555 }
1556 
1559 {
1560  simde__m128i r;
1561 
1562 #if defined(SIMDE_SSE2_NATIVE)
1563  r.n = _mm_cvttps_epi32(a.n);
1564 #elif defined(SIMDE_SSE2_NEON)
1565  r.neon_i32 = vcvtq_s32_f32(a.neon_f32);
1566 #else
1567  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1568  r.i32[i] = (int32_t)truncf(a.f32[i]);
1569  }
1570 #endif
1571 
1572  return r;
1573 }
1574 
1577 {
1578 #if defined(SIMDE_SSE2_NATIVE)
1579  return _mm_cvttsd_si32(a.n);
1580 #else
1581  return (int32_t)trunc(a.f64[0]);
1582 #endif
1583 }
1584 
1587 {
1588 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
1589 #if !defined(__PGI)
1590  return _mm_cvttsd_si64(a.n);
1591 #else
1592  return _mm_cvttsd_si64x(a.n);
1593 #endif
1594 #else
1595  return (int64_t)trunc(a.f64[0]);
1596 #endif
1597 }
1598 #define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a)
1599 
1602 {
1603  simde__m128d r;
1604 
1605 #if defined(SIMDE_SSE2_NATIVE)
1606  r.n = _mm_div_pd(a.n, b.n);
1607 #else
1609  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
1610  r.f64[i] = a.f64[i] / b.f64[i];
1611  }
1612 #endif
1613 
1614  return r;
1615 }
1616 
1619 {
1620  simde__m128d r;
1621 
1622 #if defined(SIMDE_SSE2_NATIVE)
1623  r.n = _mm_div_sd(a.n, b.n);
1624 #else
1625  r.f64[0] = a.f64[0] / b.f64[0];
1626  r.f64[1] = a.f64[1];
1627 #endif
1628 
1629  return r;
1630 }
1631 
1633 int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8)
1634 {
1635  return a.u16[imm8 & 7];
1636 }
1637 #if defined(SIMDE_SSE2_NATIVE) && \
1638  (!defined(SIMDE__REALLY_GCC) || HEDLEY_GCC_VERSION_CHECK(4, 6, 0))
1639 #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a.n, imm8)
1640 #elif defined(SIMDE_SSE2_NEON)
1641 #define simde_mm_extract_epi16(a, imm8) \
1642  (vgetq_lane_s16((a).neon_i16, (imm8)) & ((int32_t)UINT32_C(0x0000ffff)))
1643 #endif
1644 
1646 simde__m128i simde_mm_insert_epi16(simde__m128i a, int32_t i, const int imm8)
1647 {
1648  a.u16[imm8 & 7] = (int16_t)i;
1649  return a;
1650 }
1651 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
1652 #define simde_mm_insert_epi16(a, i, imm8) \
1653  SIMDE__M128I_C(_mm_insert_epi16((a).n, (i), (imm8)))
1654 #elif defined(SIMDE_SSE2_NEON)
1655 #define simde_mm_insert_epi16(a, i, imm8) \
1656  SIMDE__M128I_NEON_C(i16, vsetq_lane_s16((i), a.neon_i16, (imm8)))
1657 #endif
1658 
1662 {
1663  simde__m128d r;
1664 
1665  simde_assert_aligned(16, mem_addr);
1666 
1667 #if defined(SIMDE_SSE2_NATIVE)
1668  r.n = _mm_load_pd(mem_addr);
1669 #elif defined(SIMDE_SSE2_NEON)
1670  r.neon_u32 = vld1q_u32((uint32_t const *)mem_addr);
1671 #else
1672  SIMDE__ASSUME_ALIGNED(mem_addr, 16);
1673  memcpy(&r, mem_addr, sizeof(r));
1674 #endif
1675 
1676  return r;
1677 }
1678 
1681 {
1682  simde__m128d r;
1683 
1684 #if defined(SIMDE_SSE2_NATIVE)
1685  r.n = _mm_load_pd1(mem_addr);
1686 #else
1687  r.f64[0] = *mem_addr;
1688  r.f64[1] = *mem_addr;
1689 #endif
1690 
1691  return r;
1692 }
1693 #define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr)
1694 
1697 {
1698  simde__m128d r;
1699 
1700 #if defined(SIMDE_SSE2_NATIVE)
1701  r.n = _mm_load_sd(mem_addr);
1702 #else
1703  memcpy(&r, mem_addr, sizeof(simde_float64));
1704  r.u64[1] = 0;
1705 #endif
1706 
1707  return r;
1708 }
1709 
1712 {
1713  simde__m128i r;
1714 
1715  simde_assert_aligned(16, mem_addr);
1716 
1717 #if defined(SIMDE_SSE2_NATIVE)
1718  r.n = _mm_load_si128(&(mem_addr->n));
1719 #elif defined(SIMDE_SSE2_NEON)
1720  r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
1721 #else
1722  SIMDE__ASSUME_ALIGNED(mem_addr, 16);
1723  memcpy(&r, mem_addr, sizeof(r));
1724 #endif
1725 
1726  return r;
1727 }
1728 
1731 {
1732  simde__m128d r;
1733 
1734 #if defined(SIMDE_SSE2_NATIVE)
1735  r.n = _mm_loadh_pd(a.n, mem_addr);
1736 #else
1737  simde_float64 t;
1738  memcpy(&t, mem_addr, sizeof(t));
1739  r.f64[0] = a.f64[0];
1740  r.f64[1] = t;
1741 #endif
1742 
1743  return r;
1744 }
1745 
1748 {
1749  simde__m128i r;
1750 
1751 #if defined(SIMDE_SSE2_NATIVE)
1752  r.n = _mm_loadl_epi64(&mem_addr->n);
1753 #elif defined(SIMDE_SSE2_NEON)
1754  r.neon_i32 = vcombine_s32(vld1_s32((int32_t const *)mem_addr),
1755  vcreate_s32(0));
1756 #else
1757  r.u64[0] = mem_addr->u64[0];
1758  r.u64[1] = 0;
1759 #endif
1760 
1761  return r;
1762 }
1763 
1766 {
1767  simde__m128d r;
1768 
1769 #if defined(SIMDE_SSE2_NATIVE)
1770  r.n = _mm_loadl_pd(a.n, mem_addr);
1771 #else
1772  memcpy(&r, mem_addr, sizeof(simde_float64));
1773  r.u64[1] = a.u64[1];
1774 #endif
1775 
1776  return r;
1777 }
1778 
1782 {
1783  simde__m128d r;
1784 
1785  simde_assert_aligned(16, mem_addr);
1786 
1787 #if defined(SIMDE_SSE2_NATIVE)
1788  r.n = _mm_loadr_pd(mem_addr);
1789 #else
1790  SIMDE__ASSUME_ALIGNED(mem_addr, 16);
1791  r.f64[0] = mem_addr[1];
1792  r.f64[1] = mem_addr[0];
1793 #endif
1794 
1795  return r;
1796 }
1797 
1801 {
1802  simde__m128d r;
1803 
1804 #if defined(SIMDE_SSE2_NATIVE)
1805  r.n = _mm_loadu_pd(mem_addr);
1806 #else
1807  simde_float64 l, h;
1808  memcpy(&l, &mem_addr[0], sizeof(l));
1809  memcpy(&h, &mem_addr[1], sizeof(h));
1810  r.f64[0] = l;
1811  r.f64[1] = h;
1812 #endif
1813 
1814  return r;
1815 }
1816 
1819 {
1820  simde__m128i r;
1821 
1822 #if defined(SIMDE_SSE2_NATIVE)
1823  r.n = _mm_loadu_si128(&((*mem_addr).n));
1824 #elif defined(SIMDE_SSE2_NEON)
1825  r.neon_i32 = vld1q_s32((int32_t const *)mem_addr);
1826 #else
1827  memcpy(&r, mem_addr, sizeof(r));
1828 #endif
1829 
1830  return r;
1831 }
1832 
1835 {
1836  simde__m128i r;
1837 
1838 #if defined(SIMDE_SSE2_NATIVE)
1839  r.n = _mm_madd_epi16(a.n, b.n);
1840 #elif defined(SIMDE_SSE2_NEON)
1841  int32x4_t pl =
1842  vmull_s16(vget_low_s16(a.neon_i16), vget_low_s16(b.neon_i16));
1843  int32x4_t ph =
1844  vmull_s16(vget_high_s16(a.neon_i16), vget_high_s16(b.neon_i16));
1845  int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
1846  int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
1847  r.neon_i32 = vcombine_s32(rl, rh);
1848 #else
1850  for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i += 2) {
1851  r.i32[i / 2] =
1852  (a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
1853  }
1854 #endif
1855 
1856  return r;
1857 }
1858 
1860 void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask,
1861  int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)])
1862 {
1863 #if defined(SIMDE_SSE2_NATIVE)
1864  _mm_maskmoveu_si128(a.n, mask.n, (char *)mem_addr);
1865 #else
1866  for (size_t i = 0; i < 16; i++) {
1867  if (mask.u8[i] & 0x80) {
1868  mem_addr[i] = a.i8[i];
1869  }
1870  }
1871 #endif
1872 }
1873 
1876 {
1877 #if defined(SIMDE_SSE2_NATIVE)
1878  return _mm_movemask_epi8(a.n);
1879 #elif defined(SIMDE_SSE2_NEON)
1880  uint8x16_t input = a.neon_u8;
1881  SIMDE_ALIGN(16)
1882  static const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
1883  uint8x8_t mask_and = vdup_n_u8(0x80);
1884  int8x8_t mask_shift = vld1_s8(xr);
1885 
1886  uint8x8_t lo = vget_low_u8(input);
1887  uint8x8_t hi = vget_high_u8(input);
1888 
1889  lo = vand_u8(lo, mask_and);
1890  lo = vshl_u8(lo, mask_shift);
1891 
1892  hi = vand_u8(hi, mask_and);
1893  hi = vshl_u8(hi, mask_shift);
1894 
1895  lo = vpadd_u8(lo, lo);
1896  lo = vpadd_u8(lo, lo);
1897  lo = vpadd_u8(lo, lo);
1898 
1899  hi = vpadd_u8(hi, hi);
1900  hi = vpadd_u8(hi, hi);
1901  hi = vpadd_u8(hi, hi);
1902 
1903  return ((hi[0] << 8) | (lo[0] & 0xFF));
1904 #else
1905  int32_t r = 0;
1907  for (size_t i = 0; i < 16; i++) {
1908  r |= (a.u8[15 - i] >> 7) << (15 - i);
1909  }
1910  return r;
1911 #endif
1912 }
1913 
1916 {
1917 #if defined(SIMDE_SSE2_NATIVE)
1918  return _mm_movemask_pd(a.n);
1919 #else
1920  int32_t r = 0;
1922  for (size_t i = 0; i < (sizeof(a.u64) / sizeof(a.u64[0])); i++) {
1923  r |= (a.u64[i] >> 63) << i;
1924  }
1925  return r;
1926 #endif
1927 }
1928 
1931 {
1932  simde__m64 r;
1933 
1934 #if defined(SIMDE_SSE2_NATIVE)
1935  r.n = _mm_movepi64_pi64(a.n);
1936 #else
1937  r.i64[0] = a.i64[0];
1938 #endif
1939 
1940  return r;
1941 }
1942 
1945 {
1946  simde__m128i r;
1947 
1948 #if defined(SIMDE_SSE2_NATIVE)
1949  r.n = _mm_movpi64_epi64(a.n);
1950 #else
1951  r.i64[0] = a.i64[0];
1952  r.i64[1] = 0;
1953 #endif
1954 
1955  return r;
1956 }
1957 
1960 {
1961  simde__m128i r;
1962 
1963 #if defined(SIMDE_SSE2_NATIVE)
1964  r.n = _mm_min_epi16(a.n, b.n);
1965 #elif defined(SIMDE_SSE2_NEON)
1966  r.neon_i16 = vminq_s16(a.neon_i16, b.neon_i16);
1967 #else
1969  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
1970  r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i];
1971  }
1972 #endif
1973 
1974  return r;
1975 }
1976 
1979 {
1980  simde__m128i r;
1981 
1982 #if defined(SIMDE_SSE2_NATIVE)
1983  r.n = _mm_min_epu8(a.n, b.n);
1984 #elif defined(SIMDE_SSE2_NEON)
1985  r.neon_u8 = vminq_u8(a.neon_u8, b.neon_u8);
1986 #else
1988  for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
1989  r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i];
1990  }
1991 #endif
1992 
1993  return r;
1994 }
1995 
1998 {
1999  simde__m128d r;
2000 
2001 #if defined(SIMDE_SSE2_NATIVE)
2002  r.n = _mm_min_pd(a.n, b.n);
2003 #else
2005  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
2006  r.f64[i] = (a.f64[i] < b.f64[i]) ? a.f64[i] : b.f64[i];
2007  }
2008 #endif
2009 
2010  return r;
2011 }
2012 
2015 {
2016  simde__m128d r;
2017 
2018 #if defined(SIMDE_SSE2_NATIVE)
2019  r.n = _mm_min_sd(a.n, b.n);
2020 #else
2021  r.f64[0] = (a.f64[0] < b.f64[0]) ? a.f64[0] : b.f64[0];
2022  r.f64[1] = a.f64[1];
2023 #endif
2024 
2025  return r;
2026 }
2027 
2030 {
2031  simde__m128i r;
2032 
2033 #if defined(SIMDE_SSE2_NATIVE)
2034  r.n = _mm_max_epi16(a.n, b.n);
2035 #elif defined(SIMDE_SSE2_NEON)
2036  r.neon_i16 = vmaxq_s16(a.neon_i16, b.neon_i16);
2037 #else
2039  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2040  r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i];
2041  }
2042 #endif
2043 
2044  return r;
2045 }
2046 
2049 {
2050  simde__m128i r;
2051 
2052 #if defined(SIMDE_SSE2_NATIVE)
2053  r.n = _mm_max_epu8(a.n, b.n);
2054 #elif defined(SIMDE_SSE2_NEON)
2055  r.neon_u8 = vmaxq_u8(a.neon_u8, b.neon_u8);
2056 #else
2058  for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
2059  r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i];
2060  }
2061 #endif
2062 
2063  return r;
2064 }
2065 
2068 {
2069  simde__m128d r;
2070 
2071 #if defined(SIMDE_SSE2_NATIVE)
2072  r.n = _mm_max_pd(a.n, b.n);
2073 #else
2075  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
2076  r.f64[i] = (a.f64[i] > b.f64[i]) ? a.f64[i] : b.f64[i];
2077  }
2078 #endif
2079 
2080  return r;
2081 }
2082 
2085 {
2086  simde__m128d r;
2087 
2088 #if defined(SIMDE_SSE2_NATIVE)
2089  r.n = _mm_max_sd(a.n, b.n);
2090 #else
2091  r.f64[0] = (a.f64[0] > b.f64[0]) ? a.f64[0] : b.f64[0];
2092  r.f64[1] = a.f64[1];
2093 #endif
2094 
2095  return r;
2096 }
2097 
2100 {
2101  simde__m128i r;
2102 
2103 #if defined(SIMDE_SSE2_NATIVE)
2104  r.n = _mm_move_epi64(a.n);
2105 #elif defined(SIMDE_SSE2_NEON)
2106  r.neon_i64 = vsetq_lane_s64(0, a.neon_i64, 1);
2107 #else
2108  r.i64[0] = a.i64[0];
2109  r.i64[1] = 0;
2110 #endif
2111 
2112  return r;
2113 }
2114 
2117 {
2118  simde__m128d r;
2119 
2120 #if defined(SIMDE_SSE2_NATIVE)
2121  r.n = _mm_move_sd(a.n, b.n);
2122 #else
2123  r.f64[0] = b.f64[0];
2124  r.f64[1] = a.f64[1];
2125 #endif
2126 
2127  return r;
2128 }
2129 
2132 {
2133  simde__m128i r;
2134 
2135 #if defined(SIMDE_SSE2_NATIVE)
2136  r.n = _mm_mul_epu32(a.n, b.n);
2137 #else
2139  for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
2140  r.u64[i] = ((uint64_t)a.u32[i * 2]) * ((uint64_t)b.u32[i * 2]);
2141  }
2142 #endif
2143 
2144  return r;
2145 }
2146 
2149 {
2150  simde__m128i r;
2151 
2153  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2154  r.i64[i] = a.i64[i] * b.i64[i];
2155  }
2156 
2157  return r;
2158 }
2159 
2162 {
2163  simde__m128i r;
2164 
2166  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2167  r.i64[i] = a.i64[i] % b.i64[i];
2168  }
2169 
2170  return r;
2171 }
2172 
2175 {
2176  simde__m128d r;
2177 
2178 #if defined(SIMDE_SSE2_NATIVE)
2179  r.n = _mm_mul_pd(a.n, b.n);
2180 #else
2182  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
2183  r.f64[i] = a.f64[i] * b.f64[i];
2184  }
2185 #endif
2186 
2187  return r;
2188 }
2189 
2192 {
2193  simde__m128d r;
2194 
2195 #if defined(SIMDE_SSE2_NATIVE)
2196  r.n = _mm_mul_sd(a.n, b.n);
2197 #else
2198  r.f64[0] = a.f64[0] * b.f64[0];
2199  r.f64[1] = a.f64[1];
2200 #endif
2201 
2202  return r;
2203 }
2204 
2207 {
2208  simde__m64 r;
2209 
2210 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
2211  r.n = _mm_mul_su32(a.n, b.n);
2212 #else
2213  r.u64[0] = ((uint64_t)a.u32[0]) * ((uint64_t)b.u32[0]);
2214 #endif
2215 
2216  return r;
2217 }
2218 
2221 {
2222  simde__m128i r;
2223 
2224 #if defined(SIMDE_SSE2_NATIVE)
2225  r.n = _mm_mulhi_epi16(a.n, b.n);
2226 #elif defined(SIMDE_SSE2_NEON)
2227  int16x4_t a3210 = vget_low_s16(a.neon_i16);
2228  int16x4_t b3210 = vget_low_s16(b.neon_i16);
2229  int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */
2230  int16x4_t a7654 = vget_high_s16(a.neon_i16);
2231  int16x4_t b7654 = vget_high_s16(b.neon_i16);
2232  int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */
2233  uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
2234  vreinterpretq_u16_s32(ab7654));
2235  r.neon_u16 = rv.val[1];
2236 #else
2238  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2239  r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
2240  ((int32_t)b.i16[i]))) >>
2241  16);
2242  }
2243 #endif
2244 
2245  return r;
2246 }
2247 
2250 {
2251  simde__m128i r;
2252 
2253 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
2254  r.n = _mm_mulhi_epu16(a.n, b.n);
2255 #else
2257  for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
2258  r.u16[i] = (uint16_t)(
2259  (((uint32_t)a.u16[i]) * ((uint32_t)b.u16[i])) >> 16);
2260  }
2261 #endif
2262 
2263  return r;
2264 }
2265 
2268 {
2269  simde__m128i r;
2270 
2271 #if defined(SIMDE_SSE2_NATIVE)
2272  r.n = _mm_mullo_epi16(a.n, b.n);
2273 #elif defined(SIMDE_SSE2_NEON)
2274  r.neon_i16 = vmulq_s16(a.neon_i16, b.neon_i16);
2275 #else
2277  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2278  r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
2279  ((int32_t)b.i16[i]))) &
2280  0xffff);
2281  }
2282 #endif
2283 
2284  return r;
2285 }
2286 
2289 {
2290  simde__m128d r;
2291 
2292 #if defined(SIMDE_SSE2_NATIVE)
2293  r.n = _mm_or_pd(a.n, b.n);
2294 #else
2296  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2297  r.i64[i] = a.i64[i] | b.i64[i];
2298  }
2299 #endif
2300 
2301  return r;
2302 }
2303 
2306 {
2307  simde__m128i r;
2308 
2309 #if defined(SIMDE_SSE2_NATIVE)
2310  r.n = _mm_or_si128(a.n, b.n);
2311 #elif defined(SIMDE_SSE2_NEON)
2312  r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
2313 #else
2315  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2316  r.i64[i] = a.i64[i] | b.i64[i];
2317  }
2318 #endif
2319 
2320  return r;
2321 }
2322 
2325 {
2326  simde__m128i r;
2327 
2328 #if defined(SIMDE_SSE2_NATIVE)
2329  r.n = _mm_packs_epi16(a.n, b.n);
2330 #elif defined(SIMDE_SSE2_NEON)
2331  r.neon_i8 = vcombine_s8(vqmovn_s16(a.neon_i16), vqmovn_s16(b.neon_i16));
2332 #else
2334  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2335  r.i8[i] = (a.i16[i] > INT8_MAX)
2336  ? INT8_MAX
2337  : ((a.i16[i] < INT8_MIN)
2338  ? INT8_MIN
2339  : ((int8_t)a.i16[i]));
2340  r.i8[i + 8] = (b.i16[i] > INT8_MAX)
2341  ? INT8_MAX
2342  : ((b.i16[i] < INT8_MIN)
2343  ? INT8_MIN
2344  : ((int8_t)b.i16[i]));
2345  }
2346 #endif
2347 
2348  return r;
2349 }
2350 
2353 {
2354  simde__m128i r;
2355 
2356 #if defined(SIMDE_SSE2_NATIVE)
2357  r.n = _mm_packs_epi32(a.n, b.n);
2358 #elif defined(SIMDE_SSE2_NEON)
2359  r.neon_i16 =
2360  vcombine_s16(vqmovn_s32(a.neon_i32), vqmovn_s32(b.neon_i32));
2361 #else
2363  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
2364  r.i16[i] = (a.i32[i] > INT16_MAX)
2365  ? INT16_MAX
2366  : ((a.i32[i] < INT16_MIN)
2367  ? INT16_MIN
2368  : ((int16_t)a.i32[i]));
2369  r.i16[i + 4] = (b.i32[i] > INT16_MAX)
2370  ? INT16_MAX
2371  : ((b.i32[i] < INT16_MIN)
2372  ? INT16_MIN
2373  : ((int16_t)b.i32[i]));
2374  }
2375 #endif
2376 
2377  return r;
2378 }
2379 
2382 {
2383  simde__m128i r;
2384 
2385 #if defined(SIMDE_SSE2_NATIVE)
2386  r.n = _mm_packus_epi16(a.n, b.n);
2387 #elif defined(SIMDE_SSE2_NEON)
2388  r.neon_u8 =
2389  vcombine_u8(vqmovun_s16(a.neon_i16), vqmovun_s16(b.neon_i16));
2390 #else
2392  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2393  r.u8[i] = (a.i16[i] > UINT8_MAX)
2394  ? UINT8_MAX
2395  : ((a.i16[i] < 0) ? 0 : ((int8_t)a.i16[i]));
2396  r.u8[i + 8] =
2397  (b.i16[i] > UINT8_MAX)
2398  ? UINT8_MAX
2399  : ((b.i16[i] < 0) ? 0 : ((int8_t)b.i16[i]));
2400  }
2401 #endif
2402 
2403  return r;
2404 }
2405 
2407 void simde_mm_pause(void)
2408 {
2409 #if defined(SIMDE_SSE2_NATIVE)
2410  _mm_pause();
2411 #endif
2412 }
2413 
2416 {
2417  simde__m128i r;
2418 
2419 #if defined(SIMDE_SSE2_NATIVE)
2420  r.n = _mm_sad_epu8(a.n, b.n);
2421 #else
2422  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2423  uint16_t tmp = 0;
2425  for (size_t j = 0; j < ((sizeof(r.u8) / sizeof(r.u8[0])) / 2);
2426  j++) {
2427  const size_t e = j + (i * 8);
2428  tmp += (a.u8[e] > b.u8[e]) ? (a.u8[e] - b.u8[e])
2429  : (b.u8[e] - a.u8[e]);
2430  }
2431  r.i64[i] = tmp;
2432  }
2433 #endif
2434 
2435  return r;
2436 }
2437 
2439 simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
2440  int8_t e11, int8_t e10, int8_t e9, int8_t e8,
2441  int8_t e7, int8_t e6, int8_t e5, int8_t e4,
2442  int8_t e3, int8_t e2, int8_t e1, int8_t e0)
2443 {
2444  simde__m128i r;
2445 
2446 #if defined(SIMDE_SSE2_NATIVE)
2447  r.n = _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4,
2448  e3, e2, e1, e0);
2449 #else
2450  r.i8[0] = e0;
2451  r.i8[1] = e1;
2452  r.i8[2] = e2;
2453  r.i8[3] = e3;
2454  r.i8[4] = e4;
2455  r.i8[5] = e5;
2456  r.i8[6] = e6;
2457  r.i8[7] = e7;
2458  r.i8[8] = e8;
2459  r.i8[9] = e9;
2460  r.i8[10] = e10;
2461  r.i8[11] = e11;
2462  r.i8[12] = e12;
2463  r.i8[13] = e13;
2464  r.i8[14] = e14;
2465  r.i8[15] = e15;
2466 #endif
2467 
2468  return r;
2469 }
2470 
2472 simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
2473  int16_t e3, int16_t e2, int16_t e1, int16_t e0)
2474 {
2475  simde__m128i r;
2476 
2477 #if defined(SIMDE_SSE2_NATIVE)
2478  r.n = _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
2479 #elif defined(SIMDE_SSE2_NEON)
2480  SIMDE_ALIGN(16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
2481  r.neon_i16 = vld1q_s16(data);
2482 #else
2483  r.i16[0] = e0;
2484  r.i16[1] = e1;
2485  r.i16[2] = e2;
2486  r.i16[3] = e3;
2487  r.i16[4] = e4;
2488  r.i16[5] = e5;
2489  r.i16[6] = e6;
2490  r.i16[7] = e7;
2491 #endif
2492 
2493  return r;
2494 }
2495 
2497 simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
2498 {
2499  simde__m128i r;
2500 
2501 #if defined(SIMDE_SSE2_NATIVE)
2502  r.n = _mm_set_epi32(e3, e2, e1, e0);
2503 #elif defined(SIMDE_SSE2_NEON)
2504  SIMDE_ALIGN(16) int32_t data[4] = {e0, e1, e2, e3};
2505  r.neon_i32 = vld1q_s32(data);
2506 #else
2507  r.i32[0] = e0;
2508  r.i32[1] = e1;
2509  r.i32[2] = e2;
2510  r.i32[3] = e3;
2511 #endif
2512 
2513  return r;
2514 }
2515 
2518 {
2519  simde__m128i r;
2520 
2521 #if defined(SIMDE_SSE2_NATIVE)
2522  r.n = _mm_set_epi64(e1.n, e0.n);
2523 #else
2524  r.i64[0] = e0.i64[0];
2525  r.i64[1] = e1.i64[0];
2526 #endif
2527 
2528  return r;
2529 }
2530 
2532 simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0)
2533 {
2534  simde__m128i r;
2535 
2536 #if defined(SIMDE_SSE2_NATIVE)
2537  r.n = _mm_set_epi64x(e1, e0);
2538 #elif defined(SIMDE_SSE2_NEON)
2539  r = SIMDE__M128I_NEON_C(i64,
2540  vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1)));
2541 #else
2542  r.i64[0] = e0;
2543  r.i64[1] = e1;
2544 #endif
2545 
2546  return r;
2547 }
2548 
2550 simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13,
2551  uint8_t e12, uint8_t e11, uint8_t e10,
2552  uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6,
2553  uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2,
2554  uint8_t e1, uint8_t e0)
2555 {
2556  simde__m128i r;
2557 
2558  r.u8[0] = e0;
2559  r.u8[1] = e1;
2560  r.u8[2] = e2;
2561  r.u8[3] = e3;
2562  r.u8[4] = e4;
2563  r.u8[5] = e5;
2564  r.u8[6] = e6;
2565  r.u8[7] = e7;
2566  r.u8[8] = e8;
2567  r.u8[9] = e9;
2568  r.u8[10] = e10;
2569  r.u8[11] = e11;
2570  r.u8[12] = e12;
2571  r.u8[13] = e13;
2572  r.u8[14] = e14;
2573  r.u8[15] = e15;
2574 
2575  return r;
2576 }
2577 
2579 simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5,
2580  uint16_t e4, uint16_t e3, uint16_t e2,
2581  uint16_t e1, uint16_t e0)
2582 {
2583  simde__m128i r;
2584 
2585  r.u16[0] = e0;
2586  r.u16[1] = e1;
2587  r.u16[2] = e2;
2588  r.u16[3] = e3;
2589  r.u16[4] = e4;
2590  r.u16[5] = e5;
2591  r.u16[6] = e6;
2592  r.u16[7] = e7;
2593 
2594  return r;
2595 }
2596 
2598 simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1,
2599  uint32_t e0)
2600 {
2601  simde__m128i r;
2602 
2603  r.u32[0] = e0;
2604  r.u32[1] = e1;
2605  r.u32[2] = e2;
2606  r.u32[3] = e3;
2607 
2608  return r;
2609 }
2610 
2612 simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0)
2613 {
2614  simde__m128i r;
2615 
2616  r.u64[0] = e0;
2617  r.u64[1] = e1;
2618 
2619  return r;
2620 }
2621 
2622 SIMDE__FUNCTION_ATTRIBUTES
2623 simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0)
2624 {
2625  simde__m128d r;
2626 
2627 #if defined(SIMDE_SSE2_NATIVE)
2628  r.n = _mm_set_pd(e1, e0);
2629 #else
2630  r.f64[0] = e0;
2631  r.f64[1] = e1;
2632 #endif
2633 
2634  return r;
2635 }
2636 
2637 SIMDE__FUNCTION_ATTRIBUTES
2638 simde__m128d simde_mm_set_pd1(simde_float64 a)
2639 {
2640  simde__m128d r;
2641 
2642 #if defined(SIMDE_SSE2_NATIVE)
2643  r.n = _mm_set1_pd(a);
2644 #else
2645  r.f64[0] = a;
2646  r.f64[1] = a;
2647 #endif
2648 
2649  return r;
2650 }
2651 
2652 SIMDE__FUNCTION_ATTRIBUTES
2653 simde__m128d simde_mm_set_sd(simde_float64 a)
2654 {
2655  simde__m128d r;
2656 
2657 #if defined(SIMDE_SSE2_NATIVE)
2658  r.n = _mm_set_sd(a);
2659 #else
2660  r.f64[0] = a;
2661  r.u64[1] = 0;
2662 #endif
2663 
2664  return r;
2665 }
2666 
2667 SIMDE__FUNCTION_ATTRIBUTES
2668 simde__m128i simde_mm_set1_epi8(int8_t a)
2669 {
2670  simde__m128i r;
2671 
2672 #if defined(SIMDE_SSE2_NATIVE)
2673  r.n = _mm_set1_epi8(a);
2674 #elif defined(SIMDE_SSE2_NEON)
2675  r.neon_i8 = vdupq_n_s8(a);
2676 #else
2678  for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
2679  r.i8[i] = a;
2680  }
2681 #endif
2682 
2683  return r;
2684 }
2685 
2686 SIMDE__FUNCTION_ATTRIBUTES
2687 simde__m128i simde_mm_set1_epi16(int16_t a)
2688 {
2689  simde__m128i r;
2690 
2691 #if defined(SIMDE_SSE2_NATIVE)
2692  r.n = _mm_set1_epi16(a);
2693 #elif defined(SIMDE_SSE2_NEON)
2694  r.neon_i16 = vdupq_n_s16(a);
2695 #else
2697  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2698  r.i16[i] = a;
2699  }
2700 #endif
2701 
2702  return r;
2703 }
2704 
2705 SIMDE__FUNCTION_ATTRIBUTES
2706 simde__m128i simde_mm_set1_epi32(int32_t a)
2707 {
2708  simde__m128i r;
2709 
2710 #if defined(SIMDE_SSE2_NATIVE)
2711  r.n = _mm_set1_epi32(a);
2712 #elif defined(SIMDE_SSE2_NEON)
2713  r.neon_i32 = vdupq_n_s32(a);
2714 #else
2716  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
2717  r.i32[i] = a;
2718  }
2719 #endif
2720 
2721  return r;
2722 }
2723 
2724 SIMDE__FUNCTION_ATTRIBUTES
2725 simde__m128i simde_mm_set1_epi64x(int64_t a)
2726 {
2727  simde__m128i r;
2728 
2729 #if defined(SIMDE_SSE2_NATIVE)
2730  r.n = _mm_set1_epi64x(a);
2731 #elif defined(SIMDE_SSE2_NEON)
2732  r.neon_i64 = vmovq_n_s64(a);
2733 #else
2735  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2736  r.i64[i] = a;
2737  }
2738 #endif
2739 
2740  return r;
2741 }
2742 
2743 SIMDE__FUNCTION_ATTRIBUTES
2744 simde__m128i simde_mm_set1_epi64(simde__m64 a)
2745 {
2746  simde__m128i r;
2747 
2748 #if defined(SIMDE_SSE2_NATIVE)
2749  r.n = _mm_set1_epi64(a.n);
2750 #else
2752  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2753  r.i64[i] = a.i64[0];
2754  }
2755 #endif
2756 
2757  return r;
2758 }
2759 
2760 SIMDE__FUNCTION_ATTRIBUTES
2761 simde__m128d simde_mm_set1_pd(simde_float64 a)
2762 {
2763  simde__m128d r;
2764 
2765 #if defined(SIMDE_SSE2_NATIVE)
2766  r.n = _mm_set1_pd(a);
2767 #else
2769  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
2770  r.f64[i] = a;
2771  }
2772 #endif
2773 
2774  return r;
2775 }
2776 
2778 simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12,
2779  int8_t e11, int8_t e10, int8_t e9, int8_t e8,
2780  int8_t e7, int8_t e6, int8_t e5, int8_t e4,
2781  int8_t e3, int8_t e2, int8_t e1, int8_t e0)
2782 {
2783  simde__m128i r;
2784 
2785 #if defined(SIMDE_SSE2_NATIVE)
2786  r.n = _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
2787  e4, e3, e2, e1, e0);
2788 #elif defined(SIMDE_SSE2_NEON)
2789  int8_t t[] = {e15, e14, e13, e12, e11, e10, e9, e8,
2790  e7, e6, e5, e4, e3, e2, e1, e0};
2791  r.neon_i8 = vld1q_s8(t);
2792 #else
2793  r.i8[0] = e15;
2794  r.i8[1] = e14;
2795  r.i8[2] = e13;
2796  r.i8[3] = e12;
2797  r.i8[4] = e11;
2798  r.i8[5] = e10;
2799  r.i8[6] = e9;
2800  r.i8[7] = e8;
2801  r.i8[8] = e7;
2802  r.i8[9] = e6;
2803  r.i8[10] = e5;
2804  r.i8[11] = e4;
2805  r.i8[12] = e3;
2806  r.i8[13] = e2;
2807  r.i8[14] = e1;
2808  r.i8[15] = e0;
2809 #endif
2810 
2811  return r;
2812 }
2813 
2815 simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4,
2816  int16_t e3, int16_t e2, int16_t e1, int16_t e0)
2817 {
2818  simde__m128i r;
2819 
2820 #if defined(SIMDE_SSE2_NATIVE)
2821  r.n = _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
2822 #elif defined(SIMDE_SSE2_NEON)
2823  int16_t t[] = {e7, e6, e5, e4, e3, e2, e1, e0};
2824  r.neon_i16 = vld1q_s16(t);
2825 #else
2826  r.i16[0] = e7;
2827  r.i16[1] = e6;
2828  r.i16[2] = e5;
2829  r.i16[3] = e4;
2830  r.i16[4] = e3;
2831  r.i16[5] = e2;
2832  r.i16[6] = e1;
2833  r.i16[7] = e0;
2834 #endif
2835 
2836  return r;
2837 }
2838 
2840 simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
2841 {
2842  simde__m128i r;
2843 
2844 #if defined(SIMDE_SSE2_NATIVE)
2845  r.n = _mm_setr_epi32(e3, e2, e1, e0);
2846 #elif defined(SIMDE_SSE2_NEON)
2847  int32_t t[] = {e3, e2, e1, e0};
2848  r.neon_i32 = vld1q_s32(t);
2849 #else
2850  r.i32[0] = e3;
2851  r.i32[1] = e2;
2852  r.i32[2] = e1;
2853  r.i32[3] = e0;
2854 #endif
2855 
2856  return r;
2857 }
2858 
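/* A minimal usage sketch (hypothetical helper, not part of SIMDe):
 * simde_mm_set_epi32() lists elements from the highest lane down while
 * simde_mm_setr_epi32() lists them from the lowest lane up, so the two
 * vectors below are identical. */
static int example_set_vs_setr(void)
{
	simde__m128i a = simde_mm_set_epi32(3, 2, 1, 0);
	simde__m128i b = simde_mm_setr_epi32(0, 1, 2, 3);
	/* both low lanes are 0 */
	return simde_mm_cvtsi128_si32(a) == simde_mm_cvtsi128_si32(b);
}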
2859 SIMDE__FUNCTION_ATTRIBUTES
2860 simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0)
2861 {
2862  simde__m128i r;
2863 
2864 #if defined(SIMDE_SSE2_NATIVE)
2865  r.n = _mm_setr_epi64(e1.n, e0.n);
2866 #elif defined(SIMDE_SSE2_NEON)
2867  r.neon_i64 = vcombine_s64(e1.neon_i64, e0.neon_i64);
2868 #else
2869  r.i64[0] = e1.i64[0];
2870  r.i64[1] = e0.i64[0];
2871 #endif
2872 
2873  return r;
2874 }
2875 
2876 SIMDE__FUNCTION_ATTRIBUTES
2877 simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0)
2878 {
2879  simde__m128d r;
2880 
2881 #if defined(SIMDE_SSE2_NATIVE)
2882  r.n = _mm_setr_pd(e1, e0);
2883 #else
2884  r.f64[0] = e1;
2885  r.f64[1] = e0;
2886 #endif
2887 
2888  return r;
2889 }
2890 
2891 SIMDE__FUNCTION_ATTRIBUTES
2892 simde__m128d simde_mm_setzero_pd(void)
2893 {
2894  simde__m128d r;
2895 
2896 #if defined(SIMDE_SSE2_NATIVE)
2897  r.n = _mm_setzero_pd();
2898 #else
2899  r.u64[0] = 0;
2900  r.u64[1] = 0;
2901 #endif
2902 
2903  return r;
2904 }
2905 
2906 SIMDE__FUNCTION_ATTRIBUTES
2907 simde__m128i simde_mm_setzero_si128(void)
2908 {
2909  simde__m128i r;
2910 
2911 #if defined(SIMDE_SSE2_NATIVE)
2912  r.n = _mm_setzero_si128();
2913 #elif defined(SIMDE_SSE2_NEON)
2914  r.neon_i32 = vdupq_n_s32(0);
2915 #else
2916  r.u64[0] = 0;
2917  r.u64[1] = 0;
2918 #endif
2919 
2920  return r;
2921 }
2922 
2923 SIMDE__FUNCTION_ATTRIBUTES
2924 simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8)
2925 {
2926  simde__m128i r;
2927 
2928  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
2929  r.i32[i] = a.i32[(imm8 >> (i * 2)) & 3];
2930  }
2931 
2932  return r;
2933 }
2934 #if defined(SIMDE_SSE2_NATIVE)
2935 #define simde_mm_shuffle_epi32(a, imm8) \
2936  SIMDE__M128I_C(_mm_shuffle_epi32((a).n, (imm8)))
2937 #elif defined(SIMDE__SHUFFLE_VECTOR)
2938 #define simde_mm_shuffle_epi32(a, imm8) \
2939  ({ \
2940  const simde__m128i simde__tmp_a_ = a; \
2941  (simde__m128i){.i32 = SIMDE__SHUFFLE_VECTOR( \
2942  32, 16, (simde__tmp_a_).i32, \
2943  (simde__tmp_a_).i32, ((imm8)) & 3, \
2944  ((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \
2945  ((imm8) >> 6) & 3)}; \
2946  })
2947 #endif
2948 
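/* A minimal usage sketch (hypothetical helper, not part of SIMDe): each
 * two-bit field of imm8 names the source lane for one destination lane,
 * lowest field first; 0x1B encodes the selections 3, 2, 1, 0 and
 * therefore reverses the four 32-bit lanes. */
static simde__m128i example_reverse_epi32(simde__m128i v)
{
	return simde_mm_shuffle_epi32(v, 0x1B);
}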
2949 SIMDE__FUNCTION_ATTRIBUTES
2950 simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8)
2951 {
2952  simde__m128d r;
2953 
2954  r.f64[0] = ((imm8 & 1) == 0) ? a.f64[0] : a.f64[1];
2955  r.f64[1] = ((imm8 & 2) == 0) ? b.f64[0] : b.f64[1];
2956 
2957  return r;
2958 }
2959 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI)
2960 #define simde_mm_shuffle_pd(a, b, imm8) \
2961  SIMDE__M128D_C(_mm_shuffle_pd((a).n, (b).n, (imm8)))
2962 #elif defined(SIMDE__SHUFFLE_VECTOR)
2963 #define simde_mm_shuffle_pd(a, b, imm8) \
2964  ({ \
2965  (simde__m128d){.f64 = SIMDE__SHUFFLE_VECTOR( \
2966  64, 16, (a).f64, (b).f64, \
2967  (((imm8)) & 1), \
2968  (((imm8) >> 1) & 1) + 2)}; \
2969  })
2970 #endif
2971 
2972 SIMDE__FUNCTION_ATTRIBUTES
2973 simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8)
2974 {
2975  simde__m128i r;
2976 
2977  r.i64[0] = a.i64[0];
2978  for (size_t i = 4; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
2979  r.i16[i] = a.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
2980  }
2981 
2982  return r;
2983 }
2984 #if defined(SIMDE_SSE2_NATIVE)
2985 #define simde_mm_shufflehi_epi16(a, imm8) \
2986  SIMDE__M128I_C(_mm_shufflehi_epi16((a).n, (imm8)))
2987 #elif defined(SIMDE__SHUFFLE_VECTOR)
2988 #define simde_mm_shufflehi_epi16(a, imm8) \
2989  ({ \
2990  const simde__m128i simde__tmp_a_ = a; \
2991  (simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \
2992  16, 16, (simde__tmp_a_).i16, \
2993  (simde__tmp_a_).i16, 0, 1, 2, 3, \
2994  (((imm8)) & 3) + 4, \
2995  (((imm8) >> 2) & 3) + 4, \
2996  (((imm8) >> 4) & 3) + 4, \
2997  (((imm8) >> 6) & 3) + 4)}; \
2998  })
2999 #endif
3000 
3001 SIMDE__FUNCTION_ATTRIBUTES
3002 simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8)
3003 {
3004  simde__m128i r;
3005 
3006  for (size_t i = 0; i < ((sizeof(r.i16) / sizeof(r.i16[0])) / 2); i++) {
3007  r.i16[i] = a.i16[((imm8 >> (i * 2)) & 3)];
3008  }
3009  r.i64[1] = a.i64[1];
3010 
3011  return r;
3012 }
3013 #if defined(SIMDE_SSE2_NATIVE)
3014 #define simde_mm_shufflelo_epi16(a, imm8) \
3015  SIMDE__M128I_C(_mm_shufflelo_epi16((a).n, (imm8)))
3016 #elif defined(SIMDE__SHUFFLE_VECTOR)
3017 #define simde_mm_shufflelo_epi16(a, imm8) \
3018  ({ \
3019  const simde__m128i simde__tmp_a_ = a; \
3020  (simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \
3021  16, 16, (simde__tmp_a_).i16, \
3022  (simde__tmp_a_).i16, (((imm8)) & 3), \
3023  (((imm8) >> 2) & 3), \
3024  (((imm8) >> 4) & 3), \
3025  (((imm8) >> 6) & 3), 4, 5, 6, 7)}; \
3026  })
3027 #endif
3028 
3029 SIMDE__FUNCTION_ATTRIBUTES
3030 simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count)
3031 {
3032 #if defined(SIMDE_SSE2_NATIVE)
3033  return SIMDE__M128I_C(_mm_sll_epi16(a.n, count.n));
3034 #else
3035  simde__m128i r;
3036 
3037  if (count.u64[0] > 15)
3038  return simde_mm_setzero_si128();
3039  const int s = (int)(count.u64[0]);
3040 
3042  for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
3043  r.u16[i] = a.u16[i] << s;
3044  }
3045  return r;
3046 #endif
3047 }
3048 
3049 SIMDE__FUNCTION_ATTRIBUTES
3050 simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count)
3051 {
3052 #if defined(SIMDE_SSE2_NATIVE)
3053  return SIMDE__M128I_C(_mm_sll_epi32(a.n, count.n));
3054 #else
3055  simde__m128i r;
3056 
3057  if (count.u64[0] > 31)
3058  return simde_mm_setzero_si128();
3059  const int s = (int)(count.u64[0]);
3060 
3062  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
3063  r.i32[i] = a.i32[i] << s;
3064  }
3065  return r;
3066 #endif
3067 }
3068 
3069 SIMDE__FUNCTION_ATTRIBUTES
3070 simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count)
3071 {
3072 #if defined(SIMDE_SSE2_NATIVE)
3073  return SIMDE__M128I_C(_mm_sll_epi64(a.n, count.n));
3074 #else
3075  simde__m128i r;
3076 
3077  if (count.u64[0] > 63)
3078  return simde_mm_setzero_si128();
3079  const int s = (int)(count.u64[0]);
3080 
3082  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
3083  r.i64[i] = a.i64[i] << s;
3084  }
3085  return r;
3086 #endif
3087 }
3088 
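/* A minimal usage sketch (hypothetical helper, not part of SIMDe): the
 * sll/srl family reads the shift amount from the low 64 bits of the
 * count vector, so a scalar count is normally wrapped with a "set"
 * helper first. */
static simde__m128i example_shift_left_u16(simde__m128i v, int bits)
{
	return simde_mm_sll_epi16(v, simde_mm_set_epi64x(0, bits));
}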
3089 SIMDE__FUNCTION_ATTRIBUTES
3090 simde__m128d simde_mm_sqrt_pd(simde__m128d a)
3091 {
3092 #if defined(SIMDE_SSE2_NATIVE)
3093  return SIMDE__M128D_C(_mm_sqrt_pd(a.n));
3094 #else
3095  simde__m128d r;
3096 
3098  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
3099  r.f64[i] = sqrt(a.f64[i]);
3100  }
3101 
3102  return r;
3103 #endif
3104 }
3105 
3106 SIMDE__FUNCTION_ATTRIBUTES
3107 simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b)
3108 {
3109 #if defined(SIMDE_SSE2_NATIVE)
3110  return SIMDE__M128D_C(_mm_sqrt_sd(a.n, b.n));
3111 #else
3112  simde__m128d r;
3113  r.f64[0] = sqrt(b.f64[0]);
3114  r.f64[1] = a.f64[1];
3115  return r;
3116 #endif
3117 }
3118 
3119 SIMDE__FUNCTION_ATTRIBUTES
3120 simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count)
3121 {
3122 #if defined(SIMDE_SSE2_NATIVE)
3123  return SIMDE__M128I_C(_mm_srl_epi16(a.n, count.n));
3124 #else
3125  simde__m128i r;
3126 
3127  if (count.u64[0] > 15)
3128  return simde_mm_setzero_si128();
3129  const int s = (int)(count.u64[0]);
3130 
3132  for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
3133  r.u16[i] = a.u16[i] >> s;
3134  }
3135  return r;
3136 #endif
3137 }
3138 
3139 SIMDE__FUNCTION_ATTRIBUTES
3140 simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count)
3141 {
3142 #if defined(SIMDE_SSE2_NATIVE)
3143  return SIMDE__M128I_C(_mm_srl_epi32(a.n, count.n));
3144 #else
3145  simde__m128i r;
3146 
3147  if (count.u64[0] > 31)
3148  return simde_mm_setzero_si128();
3149  const int s = (int)(count.u64[0]);
3150 
3152  for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
3153  r.u32[i] = a.u32[i] >> s;
3154  }
3155  return r;
3156 #endif
3157 }
3158 
3159 SIMDE__FUNCTION_ATTRIBUTES
3160 simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count)
3161 {
3162 #if defined(SIMDE_SSE2_NATIVE)
3163  return SIMDE__M128I_C(_mm_srl_epi64(a.n, count.n));
3164 #else
3165  simde__m128i r;
3166 
3167  if (count.u64[0] > 63)
3168  return simde_mm_setzero_si128();
3169  const int s = (int)(count.u64[0]);
3170 
3172  for (size_t i = 0; i < (sizeof(r.u64) / sizeof(r.u64[0])); i++) {
3173  r.u64[i] = a.u64[i] >> s;
3174  }
3175  return r;
3176 #endif
3177 }
3178 
3179 SIMDE__FUNCTION_ATTRIBUTES
3180 simde__m128i simde_mm_srai_epi16(simde__m128i a, int imm8)
3181 {
3182  simde__m128i r;
3183 
3184  const uint16_t m =
3185  (uint16_t)((~0U) << ((sizeof(int16_t) * CHAR_BIT) - imm8));
3186 
3188  for (size_t i = 0; i < (sizeof(r) / sizeof(r.u16[0])); i++) {
3189  const uint16_t is_neg = ((uint16_t)(
3190  ((a.u16[i]) >> ((sizeof(int16_t) * CHAR_BIT) - 1))));
3191  r.u16[i] = (a.u16[i] >> imm8) | (m * is_neg);
3192  }
3193 
3194  return r;
3195 }
3196 #if defined(SIMDE_SSE2_NATIVE)
3197 #define simde_mm_srai_epi16(a, imm8) \
3198  SIMDE__M128I_C(_mm_srai_epi16((a).n, (imm8)));
3199 #endif
3200 
3201 SIMDE__FUNCTION_ATTRIBUTES
3202 simde__m128i simde_mm_srai_epi32(simde__m128i a, int imm8)
3203 {
3204  simde__m128i r;
3205 
3206  const uint32_t m =
3207  (uint32_t)((~0U) << ((sizeof(int) * CHAR_BIT) - imm8));
3209  for (size_t i = 0; i < (sizeof(r) / sizeof(r.u32[0])); i++) {
3210  uint32_t is_neg = ((uint32_t)(
3211  ((a.u32[i]) >> ((sizeof(int32_t) * CHAR_BIT) - 1))));
3212  r.u32[i] = (a.u32[i] >> imm8) | (m * is_neg);
3213  }
3214 
3215  return r;
3216 }
3217 #if defined(SIMDE_SSE2_NATIVE)
3218 #define simde_mm_srai_epi32(a, imm8) \
3219  SIMDE__M128I_C(_mm_srai_epi32((a).n, (imm8)))
3220 #elif defined(SIMDE_SSE2_NEON)
3221 #define simde_mm_srai_epi32(a, imm8) \
3222  SIMDE__M128I_NEON_C( \
3223  i32, \
3224  ((imm8) <= 0) \
3225  ? (a.neon_i32) \
3226  : (((imm8) > 31) \
3227  ? (vshrq_n_s32(vshrq_n_s32(a.neon_i32, 16), \
3228  16)) \
3229  : (vshrq_n_s32(a.neon_i32, (imm8)))))
3230 #endif
3231 
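/* A minimal usage sketch (hypothetical helper, not part of SIMDe): the
 * arithmetic right shift replicates the sign bit, while a logical right
 * shift fills with zeros, so negative lanes stay negative here. */
static simde__m128i example_arithmetic_shift(void)
{
	simde__m128i v = simde_mm_set_epi32(-8, 8, -1, 1);
	/* lanes become, from lowest to highest: 0, -1, 2, -2 */
	return simde_mm_srai_epi32(v, 2);
}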
3232 SIMDE__FUNCTION_ATTRIBUTES
3233 simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count)
3234 {
3235 #if defined(SIMDE_SSE2_NATIVE)
3236  return SIMDE__M128I_C(_mm_sra_epi16(a.n, count.n));
3237 #else
3238  simde__m128i r;
3239  int cnt = (int)count.i64[0];
3240 
3241  if (cnt > 15 || cnt < 0) {
3242  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
3243  i++) {
3244  r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
3245  }
3246  } else {
3247  const uint16_t m = (uint16_t)(
3248  (~0U) << ((sizeof(int16_t) * CHAR_BIT) - cnt));
3249  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0]));
3250  i++) {
3251  const uint16_t is_neg = a.i16[i] < 0;
3252  r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
3253  }
3254  }
3255 
3256  return r;
3257 #endif
3258 }
3259 
3260 SIMDE__FUNCTION_ATTRIBUTES
3261 simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count)
3262 {
3263 #if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32)
3264  return SIMDE__M128I_C(_mm_sra_epi32(a.n, count.n));
3265 #else
3266  simde__m128i r;
3267  const uint64_t cnt = count.u64[0];
3268 
3269  if (cnt > 31) {
3270  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
3271  i++) {
3272  r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
3273  }
3274  } else if (cnt == 0) {
3275  memcpy(&r, &a, sizeof(r));
3276  } else {
3277  const uint32_t m = (uint32_t)(
3278  (~0U) << ((sizeof(int32_t) * CHAR_BIT) - cnt));
3279  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0]));
3280  i++) {
3281  const uint32_t is_neg = a.i32[i] < 0;
3282  r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
3283  }
3284  }
3285 
3286  return r;
3287 #endif
3288 }
3289 
3290 SIMDE__FUNCTION_ATTRIBUTES
3291 simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8)
3292 {
3293  simde__m128i r;
3294  const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0
3295  : imm8;
3297  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
3298  r.i16[i] = a.i16[i] << s;
3299  }
3300  return r;
3301 }
3302 #if defined(SIMDE_SSE2_NATIVE)
3303 #define simde_mm_slli_epi16(a, imm8) SIMDE__M128I_C(_mm_slli_epi16(a.n, imm8));
3304 #elif defined(SIMDE_SSE2_NEON)
3305 #define simde_mm_slli_epi16(a, imm8) \
3306  SIMDE__M128I_NEON_C( \
3307  i16, ((imm8) <= 0) \
3308  ? ((a).neon_i16) \
3309  : (((imm8) > 31) ? (vdupq_n_s16(0)) \
3310  : (vshlq_n_s16((a).neon_i16, \
3311  (imm8)))))
3312 #endif
3313 
3314 SIMDE__FUNCTION_ATTRIBUTES
3315 simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8)
3316 {
3317  simde__m128i r;
3318  const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0
3319  : imm8;
3321  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
3322  r.i32[i] = a.i32[i] << s;
3323  }
3324  return r;
3325 }
3326 #if defined(SIMDE_SSE2_NATIVE)
3327 #define simde_mm_slli_epi32(a, imm8) SIMDE__M128I_C(_mm_slli_epi32(a.n, imm8));
3328 #elif defined(SIMDE_SSE2_NEON)
3329 #define simde_mm_slli_epi32(a, imm8) \
3330  SIMDE__M128I_NEON_C( \
3331  i32, ((imm8) <= 0) \
3332  ? ((a).neon_i32) \
3333  : (((imm8) > 31) ? (vdupq_n_s32(0)) \
3334  : (vshlq_n_s32((a).neon_i32, \
3335  (imm8)))))
3336 #endif
3337 
3338 SIMDE__FUNCTION_ATTRIBUTES
3339 simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8)
3340 {
3341  simde__m128i r;
3342  const int s = (imm8 > ((int)sizeof(r.i64[0]) * CHAR_BIT) - 1) ? 0
3343  : imm8;
3345  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
3346  r.i64[i] = a.i64[i] << s;
3347  }
3348  return r;
3349 }
3350 #if defined(SIMDE_SSE2_NATIVE)
3351 #define simde_mm_slli_epi64(a, imm8) SIMDE__M128I_C(_mm_slli_epi64(a.n, imm8));
3352 #endif
3353 
3354 SIMDE__FUNCTION_ATTRIBUTES
3355 simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8)
3356 {
3357  simde__m128i r;
3358  const int s = (imm8 > ((int)sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0
3359  : imm8;
3361  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
3362  r.u16[i] = a.u16[i] >> s;
3363  }
3364  return r;
3365 }
3366 #if defined(SIMDE_SSE2_NATIVE)
3367 #define simde_mm_srli_epi16(a, imm8) SIMDE__M128I_C(_mm_srli_epi16(a.n, imm8));
3368 #elif defined(SIMDE_SSE2_NEON)
3369 #define simde_mm_srli_epi16(a, imm8) \
3370  SIMDE__M128I_NEON_C( \
3371  u16, ((imm8) <= 0) \
3372  ? ((a).neon_u16) \
3373  : (((imm8) > 31) ? (vdupq_n_u16(0)) \
3374  : (vshrq_n_u16((a).neon_u16, \
3375  (imm8)))))
3376 #endif
3377 
3378 SIMDE__FUNCTION_ATTRIBUTES
3379 simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8)
3380 {
3381  simde__m128i r;
3382  const int s = (imm8 > ((int)sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0
3383  : imm8;
3385  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
3386  r.u32[i] = a.u32[i] >> s;
3387  }
3388  return r;
3389 }
3390 #if defined(SIMDE_SSE2_NATIVE)
3391 #define simde_mm_srli_epi32(a, imm8) SIMDE__M128I_C(_mm_srli_epi32(a.n, imm8))
3392 #elif defined(SIMDE_SSE2_NEON)
3393 #define simde_mm_srli_epi32(a, imm8) \
3394  SIMDE__M128I_NEON_C( \
3395  u32, ((imm8) <= 0) \
3396  ? ((a).neon_u32) \
3397  : (((imm8) > 31) ? (vdupq_n_u32(0)) \
3398  : (vshrq_n_u32((a).neon_u32, \
3399  (imm8)))))
3400 #endif
3401 
3402 SIMDE__FUNCTION_ATTRIBUTES
3403 simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8)
3404 {
3405  simde__m128i r;
3406  const unsigned char s = imm8 & 255;
3408  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
3409  if (s > 63) {
3410  r.u64[i] = 0;
3411  } else {
3412  r.u64[i] = a.u64[i] >> s;
3413  }
3414  }
3415  return r;
3416 }
3417 #if defined(SIMDE_SSE2_NATIVE)
3418 #define simde_mm_srli_epi64(a, imm8) SIMDE__M128I_C(_mm_srli_epi64(a.n, imm8))
3419 #elif defined(SIMDE_SSE2_NEON)
3420 #define simde_mm_srli_epi64(a, imm8) \
3421  SIMDE__M128I_NEON_C( \
3422  u64, \
3423  (((imm8)&255) < 0 || ((imm8)&255) > 63) \
3424  ? (vdupq_n_u64(0)) \
3425  : ((((imm8)&255) == 0) \
3426  ? (a.neon_u64) \
3427  : (vshrq_n_u64((a).neon_u64, (imm8)&255))))
3428 #endif
3429 
3430 SIMDE__FUNCTION_ATTRIBUTES
3431 void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
3432  simde__m128d a)
3433 {
3434  simde_assert_aligned(16, mem_addr);
3435 
3436 #if defined(SIMDE_SSE2_NATIVE)
3437  _mm_store_pd(mem_addr, a.n);
3438 #else
3439  SIMDE__ASSUME_ALIGNED(mem_addr, 16);
3440  memcpy(mem_addr, &a, sizeof(a));
3441 #endif
3442 }
3443 
3444 SIMDE__FUNCTION_ATTRIBUTES
3445 void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
3446  simde__m128d a)
3447 {
3448  simde_assert_aligned(16, mem_addr);
3449 
3450 #if defined(SIMDE_SSE2_NATIVE)
3451  _mm_store1_pd(mem_addr, a.n);
3452 #else
3453  SIMDE__ASSUME_ALIGNED(mem_addr, 16);
3454  mem_addr[0] = a.f64[0];
3455  mem_addr[1] = a.f64[0];
3456 #endif
3457 }
3458 #define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(mem_addr, a)
3459 
3460 SIMDE__FUNCTION_ATTRIBUTES
3461 void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a)
3462 {
3463 #if defined(SIMDE_SSE2_NATIVE)
3464  _mm_store_sd(mem_addr, a.n);
3465 #else
3466  memcpy(mem_addr, &a, sizeof(a.f64[0]));
3467 #endif
3468 }
3469 
3470 SIMDE__FUNCTION_ATTRIBUTES
3471 void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a)
3472 {
3473 #if defined(SIMDE_SSE2_NATIVE)
3474  _mm_store_si128(&mem_addr->n, a.n);
3475 #elif defined(SIMDE_SSE2_NEON)
3476  vst1q_s32((int32_t *)mem_addr, a.neon_i32);
3477 #else
3478  SIMDE__ASSUME_ALIGNED(mem_addr, 16);
3479  memcpy(mem_addr, &a, sizeof(a));
3480 #endif
3481 }
3482 
3483 SIMDE__FUNCTION_ATTRIBUTES
3484 void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a)
3485 {
3486 #if defined(SIMDE_SSE2_NATIVE)
3487  _mm_storeh_pd(mem_addr, a.n);
3488 #else
3489  *mem_addr = a.f64[1];
3490 #endif
3491 }
3492 
3493 SIMDE__FUNCTION_ATTRIBUTES
3494 void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a)
3495 {
3496 #if defined(SIMDE_SSE2_NATIVE)
3497  _mm_storel_epi64(&(mem_addr->n), a.n);
3498 #elif defined(SIMDE_SSE2_NEON)
3499  mem_addr->i64[0] = vgetq_lane_s64(a.neon_i64, 0);
3500 #else
3501  mem_addr->i64[0] = a.i64[0];
3502 #endif
3503 }
3504 
3505 SIMDE__FUNCTION_ATTRIBUTES
3506 void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a)
3507 {
3508 #if defined(SIMDE_SSE2_NATIVE)
3509  _mm_storel_pd(mem_addr, a.n);
3510 #else
3511  *mem_addr = a.f64[0];
3512 #endif
3513 }
3514 
3515 SIMDE__FUNCTION_ATTRIBUTES
3516 void simde_mm_storer_pd(simde_float64 mem_addr[2], simde__m128d a)
3517 {
3518  simde_assert_aligned(16, mem_addr);
3519 
3520 #if defined(SIMDE_SSE2_NATIVE)
3521  _mm_storer_pd(mem_addr, a.n);
3522 #else
3523  SIMDE__ASSUME_ALIGNED(mem_addr, 16);
3524  mem_addr[0] = a.f64[1];
3525  mem_addr[1] = a.f64[0];
3526 #endif
3527 }
3528 
3529 SIMDE__FUNCTION_ATTRIBUTES
3530 void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a)
3531 {
3532 #if defined(SIMDE_SSE2_NATIVE)
3533  _mm_storeu_pd(mem_addr, a.n);
3534 #else
3535  memcpy(mem_addr, &a, sizeof(a));
3536 #endif
3537 }
3538 
3539 SIMDE__FUNCTION_ATTRIBUTES
3540 void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a)
3541 {
3542 #if defined(SIMDE_SSE2_NATIVE)
3543  _mm_storeu_si128(&mem_addr->n, a.n);
3544 #elif defined(SIMDE_SSE2_NEON)
3545  int32_t v[4];
3546  vst1q_s32(v, a.neon_i32);
3547  memcpy(mem_addr, v, sizeof(v));
3548 #else
3549  memcpy(mem_addr, &a, sizeof(a));
3550 #endif
3551 }
3552 
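/* A minimal usage sketch (hypothetical helper, not part of SIMDe):
 * simde_mm_storeu_si128() places no alignment requirement on the
 * destination, so writing into an arbitrary byte buffer is fine, whereas
 * simde_mm_store_si128() expects a 16-byte-aligned address. */
static void example_store_unaligned(uint8_t out[16])
{
	simde_mm_storeu_si128((simde__m128i *)out, simde_mm_set1_epi8(0x5A));
}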
3553 SIMDE__FUNCTION_ATTRIBUTES
3554 void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)],
3555  simde__m128d a)
3556 {
3557 #if defined(SIMDE_SSE2_NATIVE)
3558  _mm_stream_pd(mem_addr, a.n);
3559 #else
3560  SIMDE__ASSUME_ALIGNED(mem_addr, 16);
3561  memcpy(mem_addr, &a, sizeof(a));
3562 #endif
3563 }
3564 
3565 SIMDE__FUNCTION_ATTRIBUTES
3566 void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a)
3567 {
3568 #if defined(SIMDE_SSE2_NATIVE)
3569  _mm_stream_si128(&mem_addr->n, a.n);
3570 #else
3571  SIMDE__ASSUME_ALIGNED(mem_addr, 16);
3572  memcpy(mem_addr, &a, sizeof(a));
3573 #endif
3574 }
3575 
3577 void simde_mm_stream_si32(int32_t *mem_addr, int32_t a)
3578 {
3579 #if defined(SIMDE_SSE2_NATIVE)
3580  _mm_stream_si32(mem_addr, a);
3581 #else
3582  *mem_addr = a;
3583 #endif
3584 }
3585 
3587 void simde_mm_stream_si64(int64_t *mem_addr, int64_t a)
3588 {
3589 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64)
3590 #if defined(SIMDE__REALLY_GCC) && !HEDLEY_GCC_VERSION_CHECK(4, 8, 0)
3591  *mem_addr = a;
3592 #elif defined(__GNUC__)
3593  _mm_stream_si64((long long *)mem_addr, a);
3594 #else
3595  _mm_stream_si64(mem_addr, a);
3596 #endif
3597 #else
3598  *mem_addr = a;
3599 #endif
3600 }
3601 
3602 SIMDE__FUNCTION_ATTRIBUTES
3603 simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b)
3604 {
3605 #if defined(SIMDE_SSE2_NATIVE)
3606  return SIMDE__M128I_C(_mm_sub_epi8(a.n, b.n));
3607 #elif defined(SIMDE_SSE2_NEON)
3608  return SIMDE__M128I_NEON_C(i8, vsubq_s8(a.neon_i8, b.neon_i8));
3609 #else
3610  simde__m128i r;
3612  for (size_t i = 0; i < (sizeof(r.i8) / sizeof(r.i8[0])); i++) {
3613  r.i8[i] = a.i8[i] - b.i8[i];
3614  }
3615  return r;
3616 #endif
3617 }
3618 
3619 SIMDE__FUNCTION_ATTRIBUTES
3620 simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b)
3621 {
3622 #if defined(SIMDE_SSE2_NATIVE)
3623  return SIMDE__M128I_C(_mm_sub_epi16(a.n, b.n));
3624 #elif defined(SIMDE_SSE2_NEON)
3625  return SIMDE__M128I_NEON_C(i16, vsubq_s16(a.neon_i16, b.neon_i16));
3626 #else
3627  simde__m128i r;
3629  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
3630  r.i16[i] = a.i16[i] - b.i16[i];
3631  }
3632  return r;
3633 #endif
3634 }
3635 
3636 SIMDE__FUNCTION_ATTRIBUTES
3637 simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b)
3638 {
3639 #if defined(SIMDE_SSE2_NATIVE)
3640  return SIMDE__M128I_C(_mm_sub_epi32(a.n, b.n));
3641 #elif defined(SIMDE_SSE2_NEON)
3642  return SIMDE__M128I_NEON_C(i32, vsubq_s32(a.neon_i32, b.neon_i32));
3643 #else
3644  simde__m128i r;
3646  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
3647  r.i32[i] = a.i32[i] - b.i32[i];
3648  }
3649  return r;
3650 #endif
3651 }
3652 
3653 SIMDE__FUNCTION_ATTRIBUTES
3654 simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b)
3655 {
3656 #if defined(SIMDE_SSE2_NATIVE)
3657  return SIMDE__M128I_C(_mm_sub_epi64(a.n, b.n));
3658 #elif defined(SIMDE_SSE2_NEON)
3659  return SIMDE__M128I_NEON_C(i64, vsubq_s64(a.neon_i64, b.neon_i64));
3660 #else
3661  simde__m128i r;
3663  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
3664  r.i64[i] = a.i64[i] - b.i64[i];
3665  }
3666  return r;
3667 #endif
3668 }
3669 
3670 SIMDE__FUNCTION_ATTRIBUTES
3671 simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b)
3672 {
3673 #if defined(SIMDE_SSE2_NATIVE)
3674  return SIMDE__M128D_C(_mm_sub_pd(a.n, b.n));
3675 #else
3676  simde__m128d r;
3678  for (size_t i = 0; i < (sizeof(r.f64) / sizeof(r.f64[0])); i++) {
3679  r.f64[i] = a.f64[i] - b.f64[i];
3680  }
3681  return r;
3682 #endif
3683 }
3684 
3685 SIMDE__FUNCTION_ATTRIBUTES
3686 simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b)
3687 {
3688 #if defined(SIMDE_SSE2_NATIVE)
3689  return SIMDE__M128D_C(_mm_sub_sd(a.n, b.n));
3690 #else
3691  simde__m128d r;
3692  r.f64[0] = a.f64[0] - b.f64[0];
3693  r.f64[1] = a.f64[1];
3694  return r;
3695 #endif
3696 }
3697 
3698 SIMDE__FUNCTION_ATTRIBUTES
3699 simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b)
3700 {
3701 #if defined(SIMDE_SSE2_NATIVE)
3702  return SIMDE__M64_C(_mm_sub_si64(a.n, b.n));
3703 #else
3704  simde__m64 r;
3705  r.i64[0] = a.i64[0] - b.i64[0];
3706  return r;
3707 #endif
3708 }
3709 
3710 SIMDE__FUNCTION_ATTRIBUTES
3711 simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b)
3712 {
3713 #if defined(SIMDE_SSE2_NATIVE)
3714  return SIMDE__M128I_C(_mm_subs_epi8(a.n, b.n));
3715 #elif defined(SIMDE_SSE2_NEON)
3716  return SIMDE__M128I_NEON_C(i8, vqsubq_s8(a.neon_i8, b.neon_i8));
3717 #else
3718  simde__m128i r;
3720  for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
3721  if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
3722  r.i8[i] = INT8_MIN;
3723  } else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
3724  r.i8[i] = INT8_MAX;
3725  } else {
3726  r.i8[i] = (a.i8[i]) - (b.i8[i]);
3727  }
3728  }
3729  return r;
3730 #endif
3731 }
3732 
3733 SIMDE__FUNCTION_ATTRIBUTES
3734 simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b)
3735 {
3736 #if defined(SIMDE_SSE2_NATIVE)
3737  return SIMDE__M128I_C(_mm_subs_epi16(a.n, b.n));
3738 #elif defined(SIMDE_SSE2_NEON)
3739  return SIMDE__M128I_NEON_C(i16, vqsubq_s16(a.neon_i16, b.neon_i16));
3740 #else
3741  simde__m128i r;
3743  for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
3744  if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) {
3745  r.i16[i] = INT16_MIN;
3746  } else if ((b.i16[i]) < 0 &&
3747  (a.i16[i]) > INT16_MAX + (b.i16[i])) {
3748  r.i16[i] = INT16_MAX;
3749  } else {
3750  r.i16[i] = (a.i16[i]) - (b.i16[i]);
3751  }
3752  }
3753  return r;
3754 #endif
3755 }
3756 
3757 SIMDE__FUNCTION_ATTRIBUTES
3758 simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b)
3759 {
3760 #if defined(SIMDE_SSE2_NATIVE)
3761  return SIMDE__M128I_C(_mm_subs_epu8(a.n, b.n));
3762 #elif defined(SIMDE_SSE2_NEON)
3763  return SIMDE__M128I_NEON_C(u8, vqsubq_u8(a.neon_u8, b.neon_u8));
3764 #else
3765  simde__m128i r;
3767  for (size_t i = 0; i < (sizeof(r) / sizeof(r.i8[0])); i++) {
3768  const int32_t x = a.u8[i] - b.u8[i];
3769  if (x < 0) {
3770  r.u8[i] = 0;
3771  } else if (x > UINT8_MAX) {
3772  r.u8[i] = UINT8_MAX;
3773  } else {
3774  r.u8[i] = (uint8_t)x;
3775  }
3776  }
3777  return r;
3778 #endif
3779 }
3780 
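/* A minimal usage sketch (hypothetical helper, not part of SIMDe):
 * unsigned saturating subtraction clamps at zero instead of wrapping,
 * which is what makes it useful for pixel-difference style math. */
static simde__m128i example_saturating_sub(void)
{
	simde__m128i a = simde_mm_set1_epi8(10);
	simde__m128i b = simde_mm_set1_epi8(25);
	return simde_mm_subs_epu8(a, b); /* every lane is 0, not 241 */
}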
3781 SIMDE__FUNCTION_ATTRIBUTES
3782 simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b)
3783 {
3784 #if defined(SIMDE_SSE2_NATIVE)
3785  return SIMDE__M128I_C(_mm_subs_epu16(a.n, b.n));
3786 #elif defined(SIMDE_SSE2_NEON)
3787  return SIMDE__M128I_NEON_C(u16, vqsubq_u16(a.neon_u16, b.neon_u16));
3788 #else
3789  simde__m128i r;
3791  for (size_t i = 0; i < (sizeof(r) / sizeof(r.i16[0])); i++) {
3792  const int32_t x = a.u16[i] - b.u16[i];
3793  if (x < 0) {
3794  r.u16[i] = 0;
3795  } else if (x > UINT16_MAX) {
3796  r.u16[i] = UINT16_MAX;
3797  } else {
3798  r.u16[i] = (uint16_t)x;
3799  }
3800  }
3801  return r;
3802 #endif
3803 }
3804 
3805 SIMDE__FUNCTION_ATTRIBUTES
3806 int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b)
3807 {
3808 #if defined(SIMDE_SSE2_NATIVE)
3809  return _mm_ucomieq_sd(a.n, b.n);
3810 #else
3811  fenv_t envp;
3812  int x = feholdexcept(&envp);
3813  int r = a.f64[0] == b.f64[0];
3814  if (HEDLEY_LIKELY(x == 0))
3815  fesetenv(&envp);
3816  return r;
3817 #endif
3818 }
3819 
3820 SIMDE__FUNCTION_ATTRIBUTES
3821 int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b)
3822 {
3823 #if defined(SIMDE_SSE2_NATIVE)
3824  return _mm_ucomige_sd(a.n, b.n);
3825 #else
3826  fenv_t envp;
3827  int x = feholdexcept(&envp);
3828  int r = a.f64[0] >= b.f64[0];
3829  if (HEDLEY_LIKELY(x == 0))
3830  fesetenv(&envp);
3831  return r;
3832 #endif
3833 }
3834 
3835 SIMDE__FUNCTION_ATTRIBUTES
3836 int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b)
3837 {
3838 #if defined(SIMDE_SSE2_NATIVE)
3839  return _mm_ucomigt_sd(a.n, b.n);
3840 #else
3841  fenv_t envp;
3842  int x = feholdexcept(&envp);
3843  int r = a.f64[0] > b.f64[0];
3844  if (HEDLEY_LIKELY(x == 0))
3845  fesetenv(&envp);
3846  return r;
3847 #endif
3848 }
3849 
3850 SIMDE__FUNCTION_ATTRIBUTES
3851 int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b)
3852 {
3853 #if defined(SIMDE_SSE2_NATIVE)
3854  return _mm_ucomile_sd(a.n, b.n);
3855 #else
3856  fenv_t envp;
3857  int x = feholdexcept(&envp);
3858  int r = a.f64[0] <= b.f64[0];
3859  if (HEDLEY_LIKELY(x == 0))
3860  fesetenv(&envp);
3861  return r;
3862 #endif
3863 }
3864 
3865 SIMDE__FUNCTION_ATTRIBUTES
3866 int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b)
3867 {
3868 #if defined(SIMDE_SSE2_NATIVE)
3869  return _mm_ucomilt_sd(a.n, b.n);
3870 #else
3871  fenv_t envp;
3872  int x = feholdexcept(&envp);
3873  int r = a.f64[0] < b.f64[0];
3874  if (HEDLEY_LIKELY(x == 0))
3875  fesetenv(&envp);
3876  return r;
3877 #endif
3878 }
3879 
3880 SIMDE__FUNCTION_ATTRIBUTES
3881 int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b)
3882 {
3883 #if defined(SIMDE_SSE2_NATIVE)
3884  return _mm_ucomineq_sd(a.n, b.n);
3885 #else
3886  fenv_t envp;
3887  int x = feholdexcept(&envp);
3888  int r = a.f64[0] != b.f64[0];
3889  if (HEDLEY_LIKELY(x == 0))
3890  fesetenv(&envp);
3891  return r;
3892 #endif
3893 }
3894 
3895 SIMDE__FUNCTION_ATTRIBUTES
3896 simde__m128d simde_mm_undefined_pd(void)
3897 {
3898  simde__m128d r;
3899 
3900 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
3901  r.n = _mm_undefined_pd();
3902 #else
3903  r = simde_mm_setzero_pd();
3904 #endif
3905 
3906  return r;
3907 }
3908 
3909 SIMDE__FUNCTION_ATTRIBUTES
3910 simde__m128i simde_mm_undefined_si128(void)
3911 {
3912  simde__m128i r;
3913 
3914 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128)
3915  r.n = _mm_undefined_si128();
3916 #else
3917  r = simde_mm_setzero_si128();
3918 #endif
3919 
3920  return r;
3921 }
3922 
3923 SIMDE__FUNCTION_ATTRIBUTES
3924 void simde_mm_lfence(void)
3925 {
3926 #if defined(SIMDE_SSE2_NATIVE)
3927  _mm_lfence();
3928 #else
3929  simde_mm_sfence();
3930 #endif
3931 }
3932 
3933 SIMDE__FUNCTION_ATTRIBUTES
3934 void simde_mm_mfence(void)
3935 {
3936 #if defined(SIMDE_SSE2_NATIVE)
3937  _mm_mfence();
3938 #else
3939  simde_mm_sfence();
3940 #endif
3941 }
3942 
3943 SIMDE__FUNCTION_ATTRIBUTES
3944 simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b)
3945 {
3946 #if defined(SIMDE_SSE2_NATIVE)
3947  return SIMDE__M128I_C(_mm_unpackhi_epi8(a.n, b.n));
3948 #elif defined(SIMDE_SSE2_NEON)
3949  int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a.neon_i16));
3950  int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b.neon_i16));
3951  int8x8x2_t result = vzip_s8(a1, b1);
3952  return SIMDE__M128I_NEON_C(i8,
3953  vcombine_s8(result.val[0], result.val[1]));
3954 #else
3955  simde__m128i r;
3957  for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
3958  r.i8[(i * 2)] = a.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
3959  r.i8[(i * 2) + 1] =
3960  b.i8[i + ((sizeof(r) / sizeof(r.i8[0])) / 2)];
3961  }
3962  return r;
3963 #endif
3964 }
3965 
3966 SIMDE__FUNCTION_ATTRIBUTES
3967 simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b)
3968 {
3969 #if defined(SIMDE_SSE2_NATIVE)
3970  return SIMDE__M128I_C(_mm_unpackhi_epi16(a.n, b.n));
3971 #elif defined(SIMDE_SSE2_NEON)
3972  int16x4_t a1 = vget_high_s16(a.neon_i16);
3973  int16x4_t b1 = vget_high_s16(b.neon_i16);
3974  int16x4x2_t result = vzip_s16(a1, b1);
3975  return SIMDE__M128I_NEON_C(i16,
3976  vcombine_s16(result.val[0], result.val[1]));
3977 #else
3978  simde__m128i r;
3980  for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
3981  r.i16[(i * 2)] =
3982  a.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
3983  r.i16[(i * 2) + 1] =
3984  b.i16[i + ((sizeof(r) / sizeof(r.i16[0])) / 2)];
3985  }
3986  return r;
3987 #endif
3988 }
3989 
3990 SIMDE__FUNCTION_ATTRIBUTES
3991 simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b)
3992 {
3993 #if defined(SIMDE_SSE2_NATIVE)
3994  return SIMDE__M128I_C(_mm_unpackhi_epi32(a.n, b.n));
3995 #elif defined(SIMDE_SSE2_NEON)
3996  int32x2_t a1 = vget_high_s32(a.neon_i32);
3997  int32x2_t b1 = vget_high_s32(b.neon_i32);
3998  int32x2x2_t result = vzip_s32(a1, b1);
3999  return SIMDE__M128I_NEON_C(i32,
4000  vcombine_s32(result.val[0], result.val[1]));
4001 #else
4002  simde__m128i r;
4004  for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
4005  r.i32[(i * 2)] =
4006  a.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
4007  r.i32[(i * 2) + 1] =
4008  b.i32[i + ((sizeof(r) / sizeof(r.i32[0])) / 2)];
4009  }
4010  return r;
4011 #endif
4012 }
4013 
4014 SIMDE__FUNCTION_ATTRIBUTES
4015 simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b)
4016 {
4017 #if defined(SIMDE_SSE2_NATIVE)
4018  return SIMDE__M128I_C(_mm_unpackhi_epi64(a.n, b.n));
4019 #else
4020  simde__m128i r;
4022  for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
4023  r.i64[(i * 2)] =
4024  a.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
4025  r.i64[(i * 2) + 1] =
4026  b.i64[i + ((sizeof(r) / sizeof(r.i64[0])) / 2)];
4027  }
4028  return r;
4029 #endif
4030 }
4031 
4032 SIMDE__FUNCTION_ATTRIBUTES
4033 simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b)
4034 {
4035 #if defined(SIMDE_SSE2_NATIVE)
4036  return SIMDE__M128D_C(_mm_unpackhi_pd(a.n, b.n));
4037 #else
4038  simde__m128d r;
4040  for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
4041  r.f64[(i * 2)] =
4042  a.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
4043  r.f64[(i * 2) + 1] =
4044  b.f64[i + ((sizeof(r) / sizeof(r.f64[0])) / 2)];
4045  }
4046  return r;
4047 #endif
4048 }
4049 
4050 SIMDE__FUNCTION_ATTRIBUTES
4051 simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b)
4052 {
4053 #if defined(SIMDE_SSE2_NATIVE)
4054  return SIMDE__M128I_C(_mm_unpacklo_epi8(a.n, b.n));
4055 #elif defined(SIMDE_SSE2_NEON)
4056  int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a.neon_i16));
4057  int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b.neon_i16));
4058  int8x8x2_t result = vzip_s8(a1, b1);
4059  return SIMDE__M128I_NEON_C(i8,
4060  vcombine_s8(result.val[0], result.val[1]));
4061 #else
4062  simde__m128i r;
4064  for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i8[0])) / 2); i++) {
4065  r.i8[(i * 2)] = a.i8[i];
4066  r.i8[(i * 2) + 1] = b.i8[i];
4067  }
4068  return r;
4069 #endif
4070 }
4071 
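/* A minimal usage sketch (hypothetical helper, not part of SIMDe):
 * interleaving the low bytes with zeros is the usual way to widen eight
 * unsigned 8-bit values to 16 bits, pairing each byte with a zero high
 * byte. */
static simde__m128i example_widen_low_u8(simde__m128i bytes)
{
	return simde_mm_unpacklo_epi8(bytes, simde_mm_setzero_si128());
}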
4072 SIMDE__FUNCTION_ATTRIBUTES
4073 simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b)
4074 {
4075 #if defined(SIMDE_SSE2_NATIVE)
4076  return SIMDE__M128I_C(_mm_unpacklo_epi16(a.n, b.n));
4077 #elif defined(SIMDE_SSE2_NEON)
4078  int16x4_t a1 = vget_low_s16(a.neon_i16);
4079  int16x4_t b1 = vget_low_s16(b.neon_i16);
4080  int16x4x2_t result = vzip_s16(a1, b1);
4081  return SIMDE__M128I_NEON_C(i16,
4082  vcombine_s16(result.val[0], result.val[1]));
4083 #else
4084  simde__m128i r;
4086  for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i16[0])) / 2); i++) {
4087  r.i16[(i * 2)] = a.i16[i];
4088  r.i16[(i * 2) + 1] = b.i16[i];
4089  }
4090  return r;
4091 #endif
4092 }
4093 
4094 SIMDE__FUNCTION_ATTRIBUTES
4095 simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b)
4096 {
4097 #if defined(SIMDE_SSE2_NATIVE)
4098  return SIMDE__M128I_C(_mm_unpacklo_epi32(a.n, b.n));
4099 #elif defined(SIMDE_SSE2_NEON)
4100  int32x2_t a1 = vget_low_s32(a.neon_i32);
4101  int32x2_t b1 = vget_low_s32(b.neon_i32);
4102  int32x2x2_t result = vzip_s32(a1, b1);
4103  return SIMDE__M128I_NEON_C(i32,
4104  vcombine_s32(result.val[0], result.val[1]));
4105 #else
4106  simde__m128i r;
4108  for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i32[0])) / 2); i++) {
4109  r.i32[(i * 2)] = a.i32[i];
4110  r.i32[(i * 2) + 1] = b.i32[i];
4111  }
4112  return r;
4113 #endif
4114 }
4115 
4116 SIMDE__FUNCTION_ATTRIBUTES
4117 simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b)
4118 {
4119 #if defined(SIMDE_SSE2_NATIVE)
4120  return SIMDE__M128I_C(_mm_unpacklo_epi64(a.n, b.n));
4121 #else
4122  simde__m128i r;
4124  for (size_t i = 0; i < ((sizeof(r) / sizeof(r.i64[0])) / 2); i++) {
4125  r.i64[(i * 2)] = a.i64[i];
4126  r.i64[(i * 2) + 1] = b.i64[i];
4127  }
4128  return r;
4129 #endif
4130 }
4131 
4132 SIMDE__FUNCTION_ATTRIBUTES
4133 simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b)
4134 {
4135 #if defined(SIMDE_SSE2_NATIVE)
4136  return SIMDE__M128D_C(_mm_unpacklo_pd(a.n, b.n));
4137 #else
4138  simde__m128d r;
4140  for (size_t i = 0; i < ((sizeof(r) / sizeof(r.f64[0])) / 2); i++) {
4141  r.f64[(i * 2)] = a.f64[i];
4142  r.f64[(i * 2) + 1] = b.f64[i];
4143  }
4144  return r;
4145 #endif
4146 }
4147 
4148 SIMDE__FUNCTION_ATTRIBUTES
4149 simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b)
4150 {
4151 #if defined(SIMDE_SSE2_NATIVE)
4152  return SIMDE__M128D_C(_mm_xor_pd(a.n, b.n));
4153 #else
4154  simde__m128d r;
4156  for (size_t i = 0; i < (sizeof(r.i64) / sizeof(r.i64[0])); i++) {
4157  r.i64[i] = a.i64[i] ^ b.i64[i];
4158  }
4159  return r;
4160 #endif
4161 }
4162 
4163 SIMDE__FUNCTION_ATTRIBUTES
4164 simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b)
4165 {
4166 #if defined(SIMDE_SSE2_NATIVE)
4167  return SIMDE__M128I_C(_mm_xor_si128(a.n, b.n));
4168 #elif defined(SIMDE_SSE2_NEON)
4169  return SIMDE__M128I_NEON_C(i32, veorq_s32(a.neon_i32, b.neon_i32));
4170 #else
4171  simde__m128i r;
4173  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
4174  r.i32[i] = a.i32[i] ^ b.i32[i];
4175  }
4176  return r;
4177 #endif
4178 }
4179 
4180 SIMDE__FUNCTION_ATTRIBUTES
4181 simde__m128i simde_x_mm_not_si128(simde__m128i a)
4182 {
4183 #if defined(SIMDE_SSE2_NEON)
4184  return SIMDE__M128I_NEON_C(i32, vmvnq_s32(a.neon_i32));
4185 #else
4186  simde__m128i r;
4188  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
4189  r.i32[i] = ~(a.i32[i]);
4190  }
4191  return r;
4192 #endif
4193 }
4194 
4195 SIMDE__END_DECLS
4196 
4197 #endif /* !defined(SIMDE__SSE2_H) */
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_undefined_pd(void)
Definition: sse2.h:3896
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_set1_pd(simde_float64 a)
Definition: sse2.h:2761
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
Definition: sse2.h:1781
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:4095
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_mul_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2174
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_si32(int32_t *mem_addr, int32_t a)
Definition: sse2.h:3577
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_pd1(simde_float64 a)
Definition: sse2.h:2638
SIMDE_FLOAT32_TYPE simde_float32
Definition: simde-common.h:150
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtsi32_sd(simde__m128d a, int32_t b)
Definition: sse2.h:1437
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtsi64_si128(int64_t a)
Definition: sse2.h:1491
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0)
Definition: sse2.h:2579
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b)
Definition: sse2.h:3699
#define SIMDE__ASSUME_ALIGNED(ptr, align)
Definition: simde-common.h:251
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a)
Definition: sse2.h:3566
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srai_epi16(simde__m128i a, int imm8)
Definition: sse2.h:3180
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_movemask_epi8(simde__m128i a)
Definition: sse2.h:1875
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:3603
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_add_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:289
simde__m128
Definition: sse.h:124
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3821
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:4133
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_andnot_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:437
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_add_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:272
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:316
#define HEDLEY_ARRAY_PARAM(name)
Definition: hedley.h:1309
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtsi32_si128(int32_t a)
Definition: sse2.h:1452
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_max_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2084
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_castsi128_pd(simde__m128i a)
Definition: sse2.h:719
SIMDE__BEGIN_DECLS typedef SIMDE_ALIGN(16) union
Definition: sse2.h:83
#define HEDLEY_UNLIKELY(expr)
Definition: hedley.h:1066
#define SIMDE__END_DECLS
Definition: simde-common.h:131
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_not_si128(simde__m128i a)
Definition: sse2.h:4181
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvttsd_si32(simde__m128d a)
Definition: sse2.h:1576
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3686
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpgt_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:1016
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count)
Definition: sse2.h:3070
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_min_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2014
#define SIMDE__VECTORIZE_REDUCTION(r)
Definition: simde-common.h:100
SIMDE__FUNCTION_ATTRIBUTES int64_t simde_mm_cvtsi128_si64(simde__m128i a)
Definition: sse2.h:1422
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_div_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1601
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_castps_pd(simde__m128 a)
Definition: sse2.h:687
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count)
Definition: sse2.h:3261
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpgt_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:980
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi64(simde__m64 a)
Definition: sse2.h:2744
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:3654
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comineq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:647
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comilt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:637
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnge_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1090
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:255
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmplt_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:888
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadh_pd(simde__m128d a, simde_float64 const *mem_addr)
Definition: sse2.h:1730
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:238
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
Definition: sse2.h:1800
int64_t i64[1]
Definition: mmx.h:69
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_madd_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:1834
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a)
Definition: sse2.h:3471
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtepi32_pd(simde__m128i a)
Definition: sse2.h:1210
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3107
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadl_pd(simde__m128d a, simde_float64 const *mem_addr)
Definition: sse2.h:1765
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnge_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1100
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_castpd_si128(simde__m128d a)
Definition: sse2.h:672
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_or_si128(simde__m128i a, simde__m128i b)
Definition: sse2.h:2305
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a)
Definition: sse2.h:3506
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a)
Definition: sse2.h:3445
simde__m128i
Definition: sse2.h:132
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3881
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnle_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1130
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_clflush(void const *p)
Definition: sse2.h:587
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi16(int16_t a)
Definition: sse2.h:2687
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmplt_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:906
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a)
Definition: sse2.h:3431
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmplt_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:924
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_mod_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:2161
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:3991
Definition: half.h:49
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:4149
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtpi32_pd(simde__m64 a)
Definition: sse2.h:1287
#define HEDLEY_LIKELY(expr)
Definition: hedley.h:1065
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_mullo_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2267
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpgt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1049
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a)
Definition: sse2.h:3540
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_avg_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:471
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_packs_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:2352
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0)
Definition: sse2.h:2612
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b)
Definition: sse2.h:3782
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8)
Definition: sse2.h:2973
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:4117
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_avg_epu16(simde__m128i a, simde__m128i b)
Definition: sse2.h:488
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvtsi128_si32(simde__m128i a)
Definition: sse2.h:1410
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtps_epi32(simde__m128 a)
Definition: sse2.h:1302
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvttpd_pi32(simde__m128d a)
Definition: sse2.h:1542
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_movpi64_epi64(simde__m64 a)
Definition: sse2.h:1944
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvttps_epi32(simde__m128 a)
Definition: sse2.h:1558
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvtsd_si64(simde__m128d a)
Definition: sse2.h:1376
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_undefined_si128(void)
Definition: sse2.h:3910
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:769
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpneq_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:838
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_setzero_si128(void)
Definition: sse2.h:2907
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
Definition: sse2.h:2840
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
Definition: sse2.h:2550
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtss_sd(simde__m128d a, simde__m128 b)
Definition: sse2.h:1511
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_movemask_pd(simde__m128d a)
Definition: sse2.h:1915
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:751
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0)
Definition: sse2.h:2598
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_sfence(void)
Definition: sse.h:2048
Definition: mmx.h:54
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_mulhi_epu16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2249
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_mul_epu32(simde__m128i a, simde__m128i b)
Definition: sse2.h:2131
uint32_t u32[2]
Definition: mmx.h:72
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtsd_ss(simde__m128 a, simde__m128d b)
Definition: sse2.h:1391
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:3944
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpd_ps(simde__m128d a)
Definition: sse2.h:1272
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_div_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1618
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a)
Definition: sse2.h:3461
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4, int16_t e3, int16_t e2, int16_t e1, int16_t e0)
Definition: sse2.h:2472
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_move_epi64(simde__m128i a)
Definition: sse2.h:2099
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epu16(simde__m128i a, simde__m128i b)
Definition: sse2.h:384
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi32(int32_t a)
Definition: sse2.h:2706
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:4051
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_bslli_si128(simde__m128i a, const int imm8)
Definition: sse2.h:505
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3671
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_castsi128_ps(simde__m128i a)
Definition: sse2.h:734
int32_t i32[2]
Definition: mmx.h:68
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:3758
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4, int16_t e3, int16_t e2, int16_t e1, int16_t e0)
Definition: sse2.h:2815
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a)
Definition: sse2.h:3494
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:3620
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpunord_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1196
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpord_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1166
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_packs_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2324
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:3711
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_packus_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2381
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtpd_pi32(simde__m128d a)
Definition: sse2.h:1257
#define SIMDE__BEGIN_DECLS
Definition: simde-common.h:130
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpgt_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1034
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpord_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1150
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_max_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2029
#define SIMDE__FUNCTION_ATTRIBUTES
Definition: simde-common.h:121
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:221
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8)
Definition: sse2.h:3315
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpge_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1077
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count)
Definition: sse2.h:3233
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpge_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1062
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a)
Definition: sse2.h:3554
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_bsrli_si128(simde__m128i a, const int imm8)
Definition: sse2.h:546
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8)
Definition: sse2.h:3339
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8, int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0)
Definition: sse2.h:2439
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_min_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:1978
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvttpd_epi32(simde__m128d a)
Definition: sse2.h:1526
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srai_epi32(simde__m128i a, int imm8)
Definition: sse2.h:3202
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpeq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:825
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8)
Definition: sse2.h:3403
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnlt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1120
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_sqrt_pd(simde__m128d a)
Definition: sse2.h:3090
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtpd_epi32(simde__m128d a)
Definition: sse2.h:1242
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:3967
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count)
Definition: sse2.h:3160
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count)
Definition: sse2.h:3120
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpneq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:857
#define simde_assert_aligned(alignment, val)
Definition: simde-common.h:50
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmple_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:967
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:3734
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_load_sd(simde_float64 const *mem_addr)
Definition: sse2.h:1696
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b)
Definition: sse2.h:4164
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_load_pd1(simde_float64 const *mem_addr)
Definition: sse2.h:1680
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_andnot_si128(simde__m128i a, simde__m128i b)
Definition: sse2.h:454
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_mulhi_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2220
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvtsd_si32(simde__m128d a)
Definition: sse2.h:1366
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:4033
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_and_si128(simde__m128i a, simde__m128i b)
Definition: sse2.h:420
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpgt_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:998
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_min_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:1959
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8)
Definition: sse2.h:1633
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comige_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:607
SIMDE__FUNCTION_ATTRIBUTES double simde_mm_cvtsd_f64(simde__m128d a)
Definition: sse2.h:1356
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_sd(simde_float64 a)
Definition: sse2.h:2653
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnlt_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1110
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storer_pd(simde_float64 mem_addr[2], simde__m128d a)
Definition: sse2.h:3516
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)])
Definition: sse2.h:1860
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3866
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_pause(void)
Definition: sse2.h:2407
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8)
Definition: sse2.h:3291
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_lfence(void)
Definition: sse2.h:3924
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_castps_si128(simde__m128 a)
Definition: sse2.h:702
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:365
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8, int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0)
Definition: sse2.h:2778
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnle_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1140
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count)
Definition: sse2.h:3140
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtps_pd(simde__m128 a)
Definition: sse2.h:1341
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpeq_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:805
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:4073
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_insert_epi16(simde__m128i a, int32_t i, const int imm8)
Definition: sse2.h:1646
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comieq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:597
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0)
Definition: sse2.h:2860
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:204
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpunord_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1180
simde__m128d
Definition: sse2.h:175
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_add_si64(simde__m64 a, simde__m64 b)
Definition: sse2.h:302
uint64_t u64[1]
Definition: mmx.h:73
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8)
Definition: sse2.h:2950
HEDLEY_STATIC_ASSERT(16==sizeof(simde__m128i), "simde__m128i size incorrect")
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3806
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_load_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
Definition: sse2.h:1661
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int32_t b)
Definition: sse2.h:1471
#define SIMDE__VECTORIZE
Definition: simde-common.h:98
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_mul_su32(simde__m64 a, simde__m64 b)
Definition: sse2.h:2206
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi8(int8_t a)
Definition: sse2.h:2668
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_min_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1997
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_max_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:2048
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:787
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3851
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8)
Definition: sse2.h:2924
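The imm8 argument of simde_mm_shuffle_epi32 encodes, two bits per destination lane, which source lane to copy; 0x1B (binary 00 01 10 11) therefore reverses the four 32-bit lanes. A small sketch, with values chosen only for illustration:

#include <stdio.h>
#include "sse2.h"   /* assumed include path */

int main(void) {
    simde__m128i v   = simde_mm_set_epi32(4, 3, 2, 1);   /* lanes 3..0 = 4,3,2,1 */
    simde__m128i rev = simde_mm_shuffle_epi32(v, 0x1B);  /* dst lane i <- src lane (3 - i) */
    printf("new lane 0 = %d\n", simde_mm_extract_epi16(rev, 0)); /* 4 */
    return 0;
}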
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comile_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:627
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_and_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:403
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a)
Definition: sse2.h:3530
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_load_si128(simde__m128i const *mem_addr)
Definition: sse2.h:1711
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:340
SIMDE__FUNCTION_ATTRIBUTES int64_t simde_mm_cvttsd_si64(simde__m128d a)
Definition: sse2.h:1586
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_si128(simde__m128i const *mem_addr)
Definition: sse2.h:1818
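simde_mm_loadu_si128 and simde_mm_storeu_pd are the unaligned load/store forms, while simde_mm_load_pd keeps the 16-byte alignment requirement of its SSE2 counterpart. A sketch of the unaligned pair, with illustrative buffer contents and an assumed include path:

#include <stdio.h>
#include <stdint.h>
#include "sse2.h"   /* assumed include path */

int main(void) {
    /* Unaligned 128-bit integer load from an ordinary array. */
    int32_t ints[4] = { 1, 2, 3, 4 };
    simde__m128i vi = simde_mm_loadu_si128((simde__m128i const *)ints);
    printf("low 16-bit lane = %d\n", simde_mm_extract_epi16(vi, 0)); /* 1 */

    /* Build a double pair (high, low) and store it without alignment demands. */
    simde__m128d vd = simde_mm_set_pd(2.5, 1.5);
    simde_float64 out[2];
    simde_mm_storeu_pd(out, vd);
    printf("out = { %f, %f }\n", out[0], out[1]);  /* 1.5, 2.5 */
    return 0;
}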
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0)
Definition: sse2.h:2877
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi64x(int64_t a)
Definition: sse2.h:2725
SIMDE_FLOAT64_TYPE simde_float64
Definition: simde-common.h:160
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count)
Definition: sse2.h:3050
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtepi32_ps(simde__m128i a)
Definition: sse2.h:1225
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8)
Definition: sse2.h:3355
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2116
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_castpd_ps(simde__m128d a)
Definition: sse2.h:657
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0)
Definition: sse2.h:2623
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0)
Definition: sse2.h:2532
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmplt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:939
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_mul_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2191
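The _sd helpers operate on the low double only: simde_mm_mul_sd multiplies the low lanes and passes the upper lane of a through, simde_mm_cvtsd_f64 reads the low lane back out, and the comi* predicates return 0 or 1. A sketch with illustrative operands:

#include <stdio.h>
#include "sse2.h"   /* assumed include path */

int main(void) {
    simde__m128d a = simde_mm_set_pd(10.0, 3.0);   /* high = 10.0, low = 3.0 */
    simde__m128d b = simde_mm_set_pd(99.0, 2.0);   /* high = 99.0, low = 2.0 */

    simde__m128d prod = simde_mm_mul_sd(a, b);     /* low = 6.0, high stays 10.0 */
    printf("low lane = %f\n", simde_mm_cvtsd_f64(prod));       /* 6.0 */

    printf("a.low >= b.low ? %d\n", simde_mm_comige_sd(a, b)); /* 1 */
    printf("a.low == b.low ? %d\n", simde_mm_comieq_sd(a, b)); /* 0 */
    return 0;
}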
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comigt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:617
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmplt_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:870
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmple_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:952
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_mfence(void)
Definition: sse2.h:3934
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi64(simde__m64 e1, simde__m64 e0)
Definition: sse2.h:2517
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:4015
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8)
Definition: sse2.h:3379
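simde_mm_slli_epi16 and simde_mm_srli_epi32 shift every lane by an immediate constant; the sll/srl variants listed nearby take the shift count from the low 64 bits of a second vector instead. A brief sketch with illustrative values and an assumed include path:

#include <stdio.h>
#include "sse2.h"   /* assumed include path */

int main(void) {
    simde__m128i v = simde_mm_set_epi32(8, 4, 2, 1);

    simde__m128i left  = simde_mm_slli_epi16(v, 3); /* each 16-bit lane << 3 */
    simde__m128i right = simde_mm_srli_epi32(v, 1); /* each 32-bit lane >> 1 (logical) */

    printf("left lane 0  = %d\n", simde_mm_extract_epi16(left, 0));  /* 1 << 3 = 8 */
    printf("right lane 0 = %d\n", simde_mm_extract_epi16(right, 0)); /* 1 >> 1 = 0 */
    return 0;
}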
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_movepi64_pi64(simde__m128i a)
Definition: sse2.h:1930
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:3637
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sad_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:2415
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a)
Definition: sse2.h:3484
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_mul_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:2148
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_max_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2067
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8)
Definition: sse2.h:3002
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_setzero_pd(void)
Definition: sse2.h:2892
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadl_epi64(simde__m128i const *mem_addr)
Definition: sse2.h:1747
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
Definition: sse2.h:2497
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count)
Definition: sse2.h:3030
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3836
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_si64(int64_t *mem_addr, int64_t a)
Definition: sse2.h:3587
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_or_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2288
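As a closing sketch of the byte arithmetic in this listing, simde_mm_add_epi8 wraps modulo 256 while simde_mm_adds_epu8 saturates at 255. The example below uses only functions shown above; the inputs and the include path are assumptions for illustration.

#include <stdio.h>
#include "sse2.h"   /* assumed include path */

int main(void) {
    simde__m128i a = simde_mm_set1_epi8((int8_t)200);  /* byte pattern 0xC8 in every lane */
    simde__m128i b = simde_mm_set1_epi8(100);

    simde__m128i wrap = simde_mm_add_epi8(a, b);   /* 200 + 100 wraps to 44   */
    simde__m128i sat  = simde_mm_adds_epu8(a, b);  /* 200 + 100 clamps to 255 */

    /* extract_epi16 returns a 16-bit lane; mask off the low byte for printing. */
    printf("wrapping:   %d\n", simde_mm_extract_epi16(wrap, 0) & 0xFF); /* 44  */
    printf("saturating: %d\n", simde_mm_extract_epi16(sat, 0) & 0xFF);  /* 255 */
    return 0;
}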