30 #if !defined(SIMDE__SSE2_H) 31 #if !defined(SIMDE__SSE2_H) 36 #if defined(SIMDE_SSE2_NATIVE) 37 #undef SIMDE_SSE2_NATIVE 39 #if defined(SIMDE_SSE2_FORCE_NATIVE) 40 #define SIMDE_SSE2_NATIVE 41 #elif defined(__SSE2__) && !defined(SIMDE_SSE2_NO_NATIVE) && \ 42 !defined(SIMDE_NO_NATIVE) 43 #define SIMDE_SSE2_NATIVE 44 #elif defined(__ARM_NEON) && !defined(SIMDE_SSE2_NO_NEON) && \ 45 !defined(SIMDE_NO_NEON) 46 #define SIMDE_SSE2_NEON 49 #if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_SSE_NATIVE) 50 #if defined(SIMDE_SSE2_FORCE_NATIVE) 51 #error Native SSE2 support requires native SSE support 53 #warning Native SSE2 support requires native SSE support, disabling 54 #undef SIMDE_SSE2_NATIVE 56 #elif defined(SIMDE_SSE2_NEON) && !defined(SIMDE_SSE_NEON) 57 #warning SSE2 NEON support requires SSE NEON support, disabling 61 #if defined(SIMDE_SSE2_NATIVE) 62 #include <emmintrin.h> 64 #if defined(SIMDE_SSE2_NEON) 73 #define vreinterpretq_m128i_s32(v) \ 74 (simde__m128i) { .neon_i32 = v } 75 #define vreinterpretq_m128i_u64(v) \ 76 (simde__m128i) { .neon_u64 = v } 78 #define vreinterpretq_s32_m128i(a) a.neon_i32 79 #define vreinterpretq_u64_m128i(a) a.neon_u64 84 #if defined(SIMDE__ENABLE_GCC_VEC_EXT) 85 int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
86 int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
87 int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
88 int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
89 uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
90 uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
91 uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
92 uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
93 #if defined(SIMDE__HAVE_INT128) 94 simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
95 simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
97 simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
98 simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
108 #if defined(SIMDE__HAVE_INT128) 109 simde_int128 i128[1];
110 simde_uint128 u128[1];
116 #if defined(SIMDE_SSE2_NATIVE) 118 #elif defined(SIMDE_SSE2_NEON) 127 float32x4_t neon_f32;
128 #if defined(SIMDE_ARCH_AMD64) 129 float64x2_t neon_f64;
135 #if defined(SIMDE__ENABLE_GCC_VEC_EXT) 136 int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
137 int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
138 int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
139 int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
140 uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
141 uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
142 uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
143 uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
144 simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
145 simde_float64 f64 __attribute__((__vector_size__(16), __may_alias__));
159 #if defined(SIMDE_SSE2_NATIVE) 161 #elif defined(SIMDE_SSE2_NEON) 170 float32x4_t neon_f32;
171 #if defined(SIMDE_ARCH_AMD64) 172 float64x2_t neon_f64;
177 #if defined(SIMDE_SSE2_NATIVE) 179 "__m128i size doesn't match simde__m128i size");
181 "__m128d size doesn't match simde__m128d size");
194 #elif defined(SIMDE_SSE_NEON) 195 #define SIMDE__M128I_NEON_C(T, expr) \ 196 (simde__m128i) { .neon_##T = expr } 197 #define SIMDE__M128D_NEON_C(T, expr) \ 198 (simde__m128d) { .neon_##T = expr } 206 #if defined(SIMDE_SSE2_NATIVE) 207 return SIMDE__M128I_C(_mm_add_epi8(a.n, b.n));
208 #elif defined(SIMDE_SSE2_NEON) 209 return SIMDE__M128I_NEON_C(i8, vaddq_s8(a.neon_i8, b.neon_i8));
213 for (
size_t i = 0; i < (
sizeof(r.i8) /
sizeof(r.i8[0])); i++) {
214 r.i8[i] = a.i8[i] + b.i8[i];
223 #if defined(SIMDE_SSE2_NATIVE) 224 return SIMDE__M128I_C(_mm_add_epi16(a.n, b.n));
225 #elif defined(SIMDE_SSE2_NEON) 226 return SIMDE__M128I_NEON_C(i16, vaddq_s16(a.neon_i16, b.neon_i16));
230 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
231 r.i16[i] = a.i16[i] + b.i16[i];
240 #if defined(SIMDE_SSE2_NATIVE) 241 return SIMDE__M128I_C(_mm_add_epi32(a.n, b.n));
242 #elif defined(SIMDE_SSE2_NEON) 243 return SIMDE__M128I_NEON_C(i32, vaddq_s32(a.neon_i32, b.neon_i32));
247 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
248 r.i32[i] = a.i32[i] + b.i32[i];
257 #if defined(SIMDE_SSE2_NATIVE) 258 return SIMDE__M128I_C(_mm_add_epi64(a.n, b.n));
259 #elif defined(SIMDE_SSE2_NEON) 260 return SIMDE__M128I_NEON_C(i64, vaddq_s64(a.neon_i64, b.neon_i64));
264 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
265 r.i64[i] = a.i64[i] + b.i64[i];
274 #if defined(SIMDE_SSE2_NATIVE) 275 return SIMDE__M128D_C(_mm_add_pd(a.n, b.n));
276 #elif defined(SIMDE_SSE2_NEON) && defined(SIMDE_ARCH_AMD64) 277 return SIMDE__M128I_NEON_C(f64, vaddq_f64(a.neon_f64, b.neon_f64));
281 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
282 r.f64[i] = a.f64[i] + b.f64[i];
291 #if defined(SIMDE_SSE2_NATIVE) 292 return SIMDE__M128D_C(_mm_add_sd(a.n, b.n));
295 r.f64[0] = a.f64[0] + b.f64[0];
304 #if defined(SIMDE_SSE2_NATIVE) 305 return SIMDE__M64_C(_mm_add_si64(a.n, b.n));
306 #elif defined(SIMDE_SSE2_NEON) 307 return SIMDE__M64_NEON_C(i64, vadd_s64(a.neon_i64, b.neon_i64));
318 #if defined(SIMDE_SSE2_NATIVE) 319 return SIMDE__M128I_C(_mm_adds_epi8(a.n, b.n));
320 #elif defined(SIMDE_SSE2_NEON) 321 return SIMDE__M128I_NEON_C(i8, vqaddq_s8(a.neon_i8, b.neon_i8));
325 for (
size_t i = 0; i < (
sizeof(r.i8) /
sizeof(r.i8[0])); i++) {
326 if ((((b.i8[i]) > 0) && ((a.i8[i]) > (INT8_MAX - (b.i8[i]))))) {
328 }
else if ((((b.i8[i]) < 0) &&
329 ((a.i8[i]) < (INT8_MIN - (b.i8[i]))))) {
332 r.i8[i] = (a.i8[i]) + (b.i8[i]);
342 #if defined(SIMDE_SSE2_NATIVE) 343 return SIMDE__M128I_C(_mm_adds_epi16(a.n, b.n));
344 #elif defined(SIMDE_SSE2_NEON) 345 return SIMDE__M128I_NEON_C(i16, vqaddq_s16(a.neon_i16, b.neon_i16));
349 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
350 if ((((b.i16[i]) > 0) &&
351 ((a.i16[i]) > (INT16_MAX - (b.i16[i]))))) {
352 r.i16[i] = INT16_MAX;
353 }
else if ((((b.i16[i]) < 0) &&
354 ((a.i16[i]) < (INT16_MIN - (b.i16[i]))))) {
355 r.i16[i] = INT16_MIN;
357 r.i16[i] = (a.i16[i]) + (b.i16[i]);
367 #if defined(SIMDE_SSE2_NATIVE) 368 return SIMDE__M128I_C(_mm_adds_epu8(a.n, b.n));
369 #elif defined(SIMDE_SSE2_NEON) 370 return SIMDE__M128I_NEON_C(u8, vqaddq_u8(a.neon_u8, b.neon_u8));
374 for (
size_t i = 0; i < (
sizeof(r.u8) /
sizeof(r.u8[0])); i++) {
375 r.u8[i] = ((UINT8_MAX - a.u8[i]) > b.u8[i])
376 ? (a.u8[i] + b.u8[i])
386 #if defined(SIMDE_SSE2_NATIVE) 387 return SIMDE__M128I_C(_mm_adds_epu16(a.n, b.n));
388 #elif defined(SIMDE_SSE2_NEON) 389 return SIMDE__M128I_NEON_C(u16, vqaddq_u16(a.neon_u16, b.neon_u16));
393 for (
size_t i = 0; i < (
sizeof(r.u16) /
sizeof(r.u16[0])); i++) {
394 r.u16[i] = ((UINT16_MAX - a.u16[i]) > b.u16[i])
395 ? (a.u16[i] + b.u16[i])
405 #if defined(SIMDE_SSE2_NATIVE) 406 return SIMDE__M128D_C(_mm_and_pd(a.n, b.n));
407 #elif defined(SIMDE_SSE2_NEON) 408 return SIMDE__M128D_NEON_C(i32, vandq_s32(a.neon_i32, b.neon_i32));
412 for (
size_t i = 0; i < (
sizeof(r.u64) /
sizeof(r.u64[0])); i++) {
413 r.u64[i] = a.u64[i] & b.u64[i];
422 #if defined(SIMDE_SSE2_NATIVE) 423 return SIMDE__M128I_C(_mm_and_si128(a.n, b.n));
424 #elif defined(SIMDE_SSE_NEON) 425 return SIMDE__M128I_NEON_C(i32, vandq_s32(b.neon_i32, a.neon_i32));
429 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
430 r.i64[i] = a.i64[i] & b.i64[i];
439 #if defined(SIMDE_SSE2_NATIVE) 440 return SIMDE__M128D_C(_mm_andnot_pd(a.n, b.n));
441 #elif defined(SIMDE_SSE2_NEON) 442 return SIMDE__M128D_NEON_C(i32, vbicq_s32(a.neon_i32, b.neon_i32));
446 for (
size_t i = 0; i < (
sizeof(r.u64) /
sizeof(r.u64[0])); i++) {
447 r.u64[i] = ~a.u64[i] & b.u64[i];
456 #if defined(SIMDE_SSE2_NATIVE) 457 return SIMDE__M128I_C(_mm_andnot_si128(a.n, b.n));
458 #elif defined(SIMDE_SSE2_NEON) 459 return SIMDE__M128I_NEON_C(i32, vbicq_s32(b.neon_i32, a.neon_i32));
463 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
464 r.i64[i] = ~(a.i64[i]) & b.i64[i];
473 #if defined(SIMDE_SSE2_NATIVE) 474 return SIMDE__M128I_C(_mm_avg_epu8(a.n, b.n));
475 #elif defined(SIMDE_SSE2_NEON) 476 return SIMDE__M128I_NEON_C(u8, vrhaddq_u8(b.neon_u8, a.neon_u8));
480 for (
size_t i = 0; i < (
sizeof(r.u8) /
sizeof(r.u8[0])); i++) {
481 r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1;
490 #if defined(SIMDE_SSE2_NATIVE) 491 return SIMDE__M128I_C(_mm_avg_epu16(a.n, b.n));
492 #elif defined(SIMDE_SSE2_NEON) 493 return SIMDE__M128I_NEON_C(u16, vrhaddq_u16(b.neon_u16, a.neon_u16));
497 for (
size_t i = 0; i < (
sizeof(r.u16) /
sizeof(r.u16[0])); i++) {
498 r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1;
515 const int s = imm8 * 8;
517 #if defined(SIMDE__HAVE_INT128) 518 r.u128[0] = a.u128[0] << s;
521 r.u64[0] = (a.u64[0] << s);
522 r.u64[1] = (a.u64[1] << s) | (a.u64[0] >> (64 - s));
525 r.u64[1] = a.u64[0] << (s - 64);
531 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) 532 #define simde_mm_bslli_si128(a, imm8) SIMDE__M128I_C(_mm_slli_si128(a.n, imm8)) 533 #elif defined(SIMDE_SSE2_NEON) 534 #define simde_mm_bslli_si128(a, imm8) \ 535 SIMDE__M128I_NEON_C( \ 537 (((imm8) <= 0) ? ((a).neon_i8) \ 538 : (((imm8) > 15) ? (vdupq_n_s8(0)) \ 539 : (vextq_s8(vdupq_n_s8(0), \ 543 #define simde_mm_slli_si128(a, imm8) simde_mm_bslli_si128(a, imm8) 556 const int s = imm8 * 8;
558 #if defined(SIMDE__HAVE_INT128) 559 r.u128[0] = a.u128[0] >> s;
562 r.u64[0] = (a.u64[0] >> s) | (a.u64[1] << (64 - s));
563 r.u64[1] = (a.u64[1] >> s);
565 r.u64[0] = a.u64[1] >> (s - 64);
572 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) 573 #define simde_mm_bsrli_si128(a, imm8) SIMDE__M128I_C(_mm_srli_si128(a.n, imm8)) 574 #elif defined(SIMDE_SSE2_NEON) 575 #define simde_mm_bsrli_si128(a, imm8) \ 576 SIMDE__M128I_NEON_C( \ 580 : (((imm8) > 15) ? (vdupq_n_s8(0)) \ 581 : (vextq_s8((a).neon_i8, \ 582 vdupq_n_s8(0), (imm8))))) 584 #define simde_mm_srli_si128(a, imm8) simde_mm_bsrli_si128(a, imm8) 589 #if defined(SIMDE_SSE2_NATIVE) 599 #if defined(SIMDE_SSE2_NATIVE) 600 return _mm_comieq_sd(a.n, b.n);
602 return a.f64[0] == b.f64[0];
609 #if defined(SIMDE_SSE2_NATIVE) 610 return _mm_comige_sd(a.n, b.n);
612 return a.f64[0] >= b.f64[0];
619 #if defined(SIMDE_SSE2_NATIVE) 620 return _mm_comigt_sd(a.n, b.n);
622 return a.f64[0] > b.f64[0];
629 #if defined(SIMDE_SSE2_NATIVE) 630 return _mm_comile_sd(a.n, b.n);
632 return a.f64[0] <= b.f64[0];
639 #if defined(SIMDE_SSE2_NATIVE) 640 return _mm_comilt_sd(a.n, b.n);
642 return a.f64[0] < b.f64[0];
649 #if defined(SIMDE_SSE2_NATIVE) 650 return _mm_comineq_sd(a.n, b.n);
652 return a.f64[0] != b.f64[0];
659 #if defined(SIMDE_SSE2_NATIVE) 660 return SIMDE__M128_C(_mm_castpd_ps(a.n));
674 #if defined(SIMDE_SSE2_NATIVE) 675 return SIMDE__M128I_C(_mm_castpd_si128(a.n));
689 #if defined(SIMDE_SSE2_NATIVE) 690 return SIMDE__M128D_C(_mm_castps_pd(a.n));
704 #if defined(SIMDE_SSE2_NATIVE) 705 return SIMDE__M128I_C(_mm_castps_si128(a.n));
706 #elif defined(SIMDE_SSE2_NEON) 707 return SIMDE__M128I_NEON_C(i32, a.neon_i32);
721 #if defined(SIMDE_SSE2_NATIVE) 722 return SIMDE__M128D_C(_mm_castsi128_pd(a.n));
736 #if defined(SIMDE_SSE2_NATIVE) 737 return SIMDE__M128_C(_mm_castsi128_ps(a.n));
738 #elif defined(SIMDE_SSE2_NEON) 739 return SIMDE__M128_NEON_C(f32, a.neon_f32);
753 #if defined(SIMDE_SSE2_NATIVE) 754 return SIMDE__M128I_C(_mm_cmpeq_epi8(a.n, b.n));
755 #elif defined(SIMDE_SSE2_NEON) 756 return SIMDE__M128I_NEON_C(
757 i8, vreinterpretq_s8_u8(vceqq_s8(b.neon_i8, a.neon_i8)));
761 for (
size_t i = 0; i < (
sizeof(r.i8) /
sizeof(r.i8[0])); i++) {
762 r.i8[i] = (a.i8[i] == b.i8[i]) ? 0xff : 0x00;
771 #if defined(SIMDE_SSE2_NATIVE) 772 return SIMDE__M128I_C(_mm_cmpeq_epi16(a.n, b.n));
773 #elif defined(SIMDE_SSE2_NEON) 774 return SIMDE__M128I_NEON_C(
775 i16, vreinterpretq_s16_u16(vceqq_s16(b.neon_i16, a.neon_i16)));
779 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
780 r.i16[i] = (a.i16[i] == b.i16[i]) ? 0xffff : 0x0000;
789 #if defined(SIMDE_SSE2_NATIVE) 790 return SIMDE__M128I_C(_mm_cmpeq_epi32(a.n, b.n));
791 #elif defined(SIMDE_SSE2_NEON) 792 return SIMDE__M128I_NEON_C(
793 i32, vreinterpretq_s32_u32(vceqq_s32(b.neon_i32, a.neon_i32)));
797 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
798 r.i32[i] = (a.i32[i] == b.i32[i]) ? 0xffffffff : 0x00000000;
807 #if defined(SIMDE_SSE2_NATIVE) 808 return SIMDE__M128D_C(_mm_cmpeq_pd(a.n, b.n));
809 #elif defined(SIMDE_SSE2_NEON) 810 return SIMDE__M128D_NEON_C(
811 i32, vreinterpretq_s32_u32(
812 vceqq_s32(vreinterpretq_s32_f32(b.neon_f32),
813 vreinterpretq_s32_f32(a.neon_f32))));
817 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
818 r.u64[i] = (a.f64[i] == b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
827 #if defined(SIMDE_SSE2_NATIVE) 828 return SIMDE__M128D_C(_mm_cmpeq_sd(a.n, b.n));
831 r.u64[0] = (a.f64[0] == b.f64[0]) ? ~UINT64_C(0) : 0;
840 #if defined(SIMDE_SSE2_NATIVE) 841 return SIMDE__M128D_C(_mm_cmpneq_pd(a.n, b.n));
842 #elif defined(SIMDE_SSE2_NEON) 843 return SIMDE__M128D_NEON_C(f32,
844 vreinterpretq_f32_u16(vmvnq_u16(
845 vceqq_s16(b.neon_i16, a.neon_i16))));
849 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
850 r.u64[i] = (a.f64[i] != b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
859 #if defined(SIMDE_SSE2_NATIVE) 860 return SIMDE__M128D_C(_mm_cmpneq_sd(a.n, b.n));
863 r.u64[0] = (a.f64[0] != b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
872 #if defined(SIMDE_SSE2_NATIVE) 873 return SIMDE__M128I_C(_mm_cmplt_epi8(a.n, b.n));
874 #elif defined(SIMDE_SSE2_NEON) 875 return SIMDE__M128I_NEON_C(
876 i8, vreinterpretq_s8_u8(vcltq_s8(a.neon_i8, b.neon_i8)));
880 for (
size_t i = 0; i < (
sizeof(r.i8) /
sizeof(r.i8[0])); i++) {
881 r.i8[i] = (a.i8[i] < b.i8[i]) ? 0xff : 0x00;
890 #if defined(SIMDE_SSE2_NATIVE) 891 return SIMDE__M128I_C(_mm_cmplt_epi16(a.n, b.n));
892 #elif defined(SIMDE_SSE2_NEON) 893 return SIMDE__M128I_NEON_C(
894 i16, vreinterpretq_s16_u16(vcltq_s16(a.neon_i16, b.neon_i16)));
898 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
899 r.i16[i] = (a.i16[i] < b.i16[i]) ? 0xffff : 0x0000;
908 #if defined(SIMDE_SSE2_NATIVE) 909 return SIMDE__M128I_C(_mm_cmplt_epi32(a.n, b.n));
910 #elif defined(SIMDE_SSE2_NEON) 911 return SIMDE__M128I_NEON_C(
912 i32, vreinterpretq_s32_u32(vcltq_s32(a.neon_i32, b.neon_i32)));
916 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
917 r.i32[i] = (a.i32[i] < b.i32[i]) ? 0xffffffff : 0x00000000;
926 #if defined(SIMDE_SSE2_NATIVE) 927 return SIMDE__M128D_C(_mm_cmplt_pd(a.n, b.n));
931 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
932 r.u64[i] = (a.f64[i] < b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
941 #if defined(SIMDE_SSE2_NATIVE) 942 return SIMDE__M128D_C(_mm_cmplt_sd(a.n, b.n));
945 r.u64[0] = (a.f64[0] < b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
954 #if defined(SIMDE_SSE2_NATIVE) 955 return SIMDE__M128D_C(_mm_cmple_pd(a.n, b.n));
959 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
960 r.u64[i] = (a.f64[i] <= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
969 #if defined(SIMDE_SSE2_NATIVE) 970 return SIMDE__M128D_C(_mm_cmple_sd(a.n, b.n));
973 r.u64[0] = (a.f64[0] <= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
982 #if defined(SIMDE_SSE2_NATIVE) 983 return SIMDE__M128I_C(_mm_cmpgt_epi8(a.n, b.n));
984 #elif defined(SIMDE_SSE2_NEON) 985 return SIMDE__M128I_NEON_C(
986 i8, vreinterpretq_s8_u8(vcgtq_s8(a.neon_i8, b.neon_i8)));
990 for (
size_t i = 0; i < (
sizeof(r.i8) /
sizeof(r.i8[0])); i++) {
991 r.i8[i] = (a.i8[i] > b.i8[i]) ? 0xff : 0x00;
1000 #if defined(SIMDE_SSE2_NATIVE) 1001 return SIMDE__M128I_C(_mm_cmpgt_epi16(a.n, b.n));
1002 #elif defined(SIMDE_SSE2_NEON) 1003 return SIMDE__M128I_NEON_C(
1004 i16, vreinterpretq_s16_u16(vcgtq_s16(a.neon_i16, b.neon_i16)));
1008 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
1009 r.i16[i] = (a.i16[i] > b.i16[i]) ? 0xffff : 0x0000;
1018 #if defined(SIMDE_SSE2_NATIVE) 1019 return SIMDE__M128I_C(_mm_cmpgt_epi32(a.n, b.n));
1020 #elif defined(SIMDE_SSE2_NEON) 1021 return SIMDE__M128I_NEON_C(
1022 i32, vreinterpretq_s32_u32(vcgtq_s32(a.neon_i32, b.neon_i32)));
1026 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
1027 r.i32[i] = (a.i32[i] > b.i32[i]) ? 0xffffffff : 0x00000000;
1036 #if defined(SIMDE_SSE2_NATIVE) 1037 return SIMDE__M128D_C(_mm_cmpgt_pd(a.n, b.n));
1041 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
1042 r.u64[i] = (a.f64[i] > b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1051 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) 1052 return SIMDE__M128D_C(_mm_cmpgt_sd(a.n, b.n));
1055 r.u64[0] = (a.f64[0] > b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1056 r.u64[1] = a.u64[1];
1064 #if defined(SIMDE_SSE2_NATIVE) 1065 return SIMDE__M128D_C(_mm_cmpge_pd(a.n, b.n));
1069 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
1070 r.u64[i] = (a.f64[i] >= b.f64[i]) ? ~UINT64_C(0) : UINT64_C(0);
1079 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) 1080 return SIMDE__M128D_C(_mm_cmpge_sd(a.n, b.n));
1083 r.u64[0] = (a.f64[0] >= b.f64[0]) ? ~UINT64_C(0) : UINT64_C(0);
1084 r.u64[1] = a.u64[1];
1092 #if defined(SIMDE_SSE2_NATIVE) 1093 return SIMDE__M128D_C(_mm_cmpnge_pd(a.n, b.n));
1102 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) 1103 return SIMDE__M128D_C(_mm_cmpnge_sd(a.n, b.n));
1112 #if defined(SIMDE_SSE2_NATIVE) 1113 return SIMDE__M128D_C(_mm_cmpnlt_pd(a.n, b.n));
1122 #if defined(SIMDE_SSE2_NATIVE) 1123 return SIMDE__M128D_C(_mm_cmpnlt_sd(a.n, b.n));
1132 #if defined(SIMDE_SSE2_NATIVE) 1133 return SIMDE__M128D_C(_mm_cmpnle_pd(a.n, b.n));
1142 #if defined(SIMDE_SSE2_NATIVE) 1143 return SIMDE__M128D_C(_mm_cmpnle_sd(a.n, b.n));
1152 #if defined(SIMDE_SSE2_NATIVE) 1153 return SIMDE__M128D_C(_mm_cmpord_pd(a.n, b.n));
1157 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
1158 r.u64[i] = (!isnan(a.f64[i]) && !isnan(b.f64[i])) ? ~UINT64_C(0)
1168 #if defined(SIMDE_SSE2_NATIVE) 1169 return SIMDE__M128D_C(_mm_cmpord_sd(a.n, b.n));
1172 r.u64[0] = (!isnan(a.f64[0]) && !isnan(b.f64[0])) ? ~UINT64_C(0)
1174 r.u64[1] = a.u64[1];
1182 #if defined(SIMDE_SSE2_NATIVE) 1183 return SIMDE__M128D_C(_mm_cmpunord_pd(a.n, b.n));
1187 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
1188 r.u64[i] = (isnan(a.f64[i]) || isnan(b.f64[i])) ? ~UINT64_C(0)
1198 #if defined(SIMDE_SSE2_NATIVE) 1199 return SIMDE__M128D_C(_mm_cmpunord_sd(a.n, b.n));
1202 r.u64[0] = (isnan(a.f64[0]) || isnan(b.f64[0])) ? ~UINT64_C(0)
1204 r.u64[1] = a.u64[1];
1212 #if defined(SIMDE_SSE2_NATIVE) 1213 return SIMDE__M128D_C(_mm_cvtepi32_pd(a.n));
1217 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
1227 #if defined(SIMDE_SSE2_NATIVE) 1228 return SIMDE__M128_C(_mm_cvtepi32_ps(a.n));
1229 #elif defined(SIMDE_SSE2_NEON) 1230 return SIMDE__M128_NEON_C(f32, vcvtq_f32_s32(a.neon_i32));
1234 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
1244 #if defined(SIMDE_SSE2_NATIVE) 1245 return SIMDE__M128I_C(_mm_cvtpd_epi32(a.n));
1249 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
1250 r.i32[i] = (int32_t)a.f64[i];
1259 #if defined(SIMDE_SSE2_NATIVE) 1260 return SIMDE__M64_C(_mm_cvtpd_pi32(a.n));
1264 for (
size_t i = 0; i < (
sizeof(r.
i32) /
sizeof(r.
i32[0])); i++) {
1265 r.
i32[i] = (int32_t)a.f64[i];
1274 #if defined(SIMDE_SSE2_NATIVE) 1275 return SIMDE__M128_C(_mm_cvtpd_ps(a.n));
1279 for (
size_t i = 0; i < (
sizeof(a.f64) /
sizeof(a.f64[0])); i++) {
1289 #if defined(SIMDE_SSE2_NATIVE) 1290 return SIMDE__M128D_C(_mm_cvtpi32_pd(a.n));
1294 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
1304 #if defined(SIMDE_SSE2_NATIVE) 1305 return SIMDE__M128I_C(_mm_cvtps_epi32(a.n));
1306 #elif defined(SIMDE_SSE2_NEON) 1309 #if defined(SIMDE_ARCH_AARCH64) 1310 return SIMDE__M128I_NEON_C(i32, vcvtnq_s32_f32(a.neon_f32));
1312 uint32x4_t signmask = vdupq_n_u32(0x80000000);
1313 float32x4_t
half = vbslq_f32(signmask, a.neon_f32,
1315 int32x4_t r_normal = vcvtq_s32_f32(
1316 vaddq_f32(a.neon_f32,
half));
1318 vcvtq_s32_f32(a.neon_f32);
1319 int32x4_t plusone = vshrq_n_s32(vnegq_s32(r_trunc), 31);
1320 int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
1322 float32x4_t delta = vsubq_f32(
1324 vcvtq_f32_s32(r_trunc));
1325 uint32x4_t is_delta_half =
1326 vceqq_f32(delta,
half);
1327 return SIMDE__M128I_NEON_C(i32,
1328 vbslq_s32(is_delta_half, r_even, r_normal));
1333 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
1334 r.i32[i] = (int32_t)a.f32[i];
1343 #if defined(SIMDE_SSE2_NATIVE) 1344 return SIMDE__M128D_C(_mm_cvtps_pd(a.n));
1348 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
1349 r.f64[i] = a.f32[i];
1358 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) 1359 return _mm_cvtsd_f64(a.n);
1368 #if defined(SIMDE_SSE2_NATIVE) 1369 return _mm_cvtsd_si32(a.n);
1371 return (int32_t)a.f64[0];
1378 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) 1380 return _mm_cvtsd_si64x(a.n);
1382 return _mm_cvtsd_si64(a.n);
1385 return (int32_t)a.f64[0];
1388 #define simde_mm_cvtsd_si64x(a) simde_mm_cvtsd_si64(a) 1393 #if defined(SIMDE_SSE2_NATIVE) 1394 return SIMDE__M128_C(_mm_cvtsd_ss(a.n, b.n));
1401 for (
size_t i = 1; i < (
sizeof(r) /
sizeof(r.i32[0])); i++) {
1402 r.i32[i] = a.i32[i];
1412 #if defined(SIMDE_SSE2_NATIVE) 1413 return _mm_cvtsi128_si32(a.n);
1414 #elif defined(SIMDE_SSE2_NEON) 1415 return vgetq_lane_s32(a.neon_i32, 0);
1424 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) 1426 return _mm_cvtsi128_si64x(a.n);
1428 return _mm_cvtsi128_si64(a.n);
1434 #define simde_mm_cvtsi128_si64x(a) simde_mm_cvtsi128_si64(a) 1439 #if defined(SIMDE_SSE2_NATIVE) 1440 return SIMDE__M128D_C(_mm_cvtsi32_sd(a.n, b));
1445 r.i64[1] = a.i64[1];
1456 #if defined(SIMDE_SSE2_NATIVE) 1457 r.n = _mm_cvtsi32_si128(a);
1458 #elif defined(SIMDE_SSE2_NEON) 1459 r.neon_i32 = vsetq_lane_s32(a, vdupq_n_s32(0), 0);
1475 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) 1477 r.n = _mm_cvtsi64_sd(a.n, b);
1479 r.n = _mm_cvtsi64x_sd(a.n, b);
1483 r.f64[1] = a.f64[1];
1488 #define simde_mm_cvtsi64x_sd(a, b) simde_mm_cvtsi64(a, b) 1495 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) 1497 r.n = _mm_cvtsi64_si128(a);
1499 r.n = _mm_cvtsi64x_si128(a);
1508 #define simde_mm_cvtsi64x_si128(a) simde_mm_cvtsi64_si128(a) 1515 #if defined(SIMDE_SSE2_NATIVE) 1516 r.n = _mm_cvtss_sd(a.n, b.n);
1518 r.f64[0] = b.f32[0];
1519 r.i64[1] = a.i64[1];
1530 #if defined(SIMDE_SSE2_NATIVE) 1531 r.n = _mm_cvttpd_epi32(a.n);
1533 for (
size_t i = 0; i < (
sizeof(a.f64) /
sizeof(a.f64[0])); i++) {
1534 r.i32[i] = (int32_t)trunc(a.f64[i]);
1546 #if defined(SIMDE_SSE2_NATIVE) 1547 r.n = _mm_cvttpd_pi32(a.n);
1549 for (
size_t i = 0; i < (
sizeof(r.
i32) /
sizeof(r.
i32[0])); i++) {
1550 r.
i32[i] = (int32_t)trunc(a.f64[i]);
1562 #if defined(SIMDE_SSE2_NATIVE) 1563 r.n = _mm_cvttps_epi32(a.n);
1564 #elif defined(SIMDE_SSE2_NEON) 1565 r.neon_i32 = vcvtq_s32_f32(a.neon_f32);
1567 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
1568 r.i32[i] = (int32_t)truncf(a.f32[i]);
1578 #if defined(SIMDE_SSE2_NATIVE) 1579 return _mm_cvttsd_si32(a.n);
1581 return (int32_t)trunc(a.f64[0]);
1588 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) 1590 return _mm_cvttsd_si64(a.n);
1592 return _mm_cvttsd_si64x(a.n);
1595 return (int64_t)trunc(a.f64[0]);
1598 #define simde_mm_cvttsd_si64x(a) simde_mm_cvttsd_si64(a) 1605 #if defined(SIMDE_SSE2_NATIVE) 1606 r.n = _mm_div_pd(a.n, b.n);
1609 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
1610 r.f64[i] = a.f64[i] / b.f64[i];
1622 #if defined(SIMDE_SSE2_NATIVE) 1623 r.n = _mm_div_sd(a.n, b.n);
1625 r.f64[0] = a.f64[0] / b.f64[0];
1626 r.f64[1] = a.f64[1];
1635 return a.u16[imm8 & 7];
1637 #if defined(SIMDE_SSE2_NATIVE) && \ 1638 (!defined(SIMDE__REALLY_GCC) || HEDLEY_GCC_VERSION_CHECK(4, 6, 0)) 1639 #define simde_mm_extract_epi16(a, imm8) _mm_extract_epi16(a.n, imm8) 1640 #elif defined(SIMDE_SSE2_NEON) 1641 #define simde_mm_extract_epi16(a, imm8) \ 1642 (vgetq_lane_s16((a).neon_i16, (imm8)) & ((int32_t)UINT32_C(0x0000ffff))) 1648 a.u16[imm8 & 7] = (int16_t)i;
1651 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) 1652 #define simde_mm_insert_epi16(a, i, imm8) \ 1653 SIMDE__M128I_C(_mm_insert_epi16((a).n, (i), (imm8))) 1654 #elif defined(SIMDE_SSE2_NEON) 1655 #define simde_mm_insert_epi16(a, i, imm8) \ 1656 SIMDE__M128I_NEON_C(i16, vsetq_lane_s16((i), a.neon_i16, (imm8))) 1667 #if defined(SIMDE_SSE2_NATIVE) 1668 r.n = _mm_load_pd(mem_addr);
1669 #elif defined(SIMDE_SSE2_NEON) 1670 r.neon_u32 = vld1q_u32((uint32_t
const *)mem_addr);
1673 memcpy(&r, mem_addr,
sizeof(r));
1684 #if defined(SIMDE_SSE2_NATIVE) 1685 r.n = _mm_load_pd1(mem_addr);
1687 r.f64[0] = *mem_addr;
1688 r.f64[1] = *mem_addr;
1693 #define simde_mm_load1_pd(mem_addr) simde_mm_load_pd1(mem_addr) 1700 #if defined(SIMDE_SSE2_NATIVE) 1701 r.n = _mm_load_sd(mem_addr);
1717 #if defined(SIMDE_SSE2_NATIVE) 1718 r.n = _mm_load_si128(&(mem_addr->n));
1719 #elif defined(SIMDE_SSE2_NEON) 1720 r.neon_i32 = vld1q_s32((int32_t
const *)mem_addr);
1723 memcpy(&r, mem_addr,
sizeof(r));
1734 #if defined(SIMDE_SSE2_NATIVE) 1735 r.n = _mm_loadh_pd(a.n, mem_addr);
1738 memcpy(&t, mem_addr,
sizeof(t));
1739 r.f64[0] = a.f64[0];
1751 #if defined(SIMDE_SSE2_NATIVE) 1752 r.n = _mm_loadl_epi64(&mem_addr->n);
1753 #elif defined(SIMDE_SSE2_NEON) 1754 r.neon_i32 = vcombine_s32(vld1_s32((int32_t
const *)mem_addr),
1757 r.u64[0] = mem_addr->u64[0];
1769 #if defined(SIMDE_SSE2_NATIVE) 1770 r.n = _mm_loadl_pd(a.n, mem_addr);
1773 r.u64[1] = a.u64[1];
1787 #if defined(SIMDE_SSE2_NATIVE) 1788 r.n = _mm_loadr_pd(mem_addr);
1791 r.f64[0] = mem_addr[1];
1792 r.f64[1] = mem_addr[0];
1804 #if defined(SIMDE_SSE2_NATIVE) 1805 r.n = _mm_loadu_pd(mem_addr);
1808 memcpy(&l, &mem_addr[0],
sizeof(l));
1809 memcpy(&h, &mem_addr[1],
sizeof(h));
1822 #if defined(SIMDE_SSE2_NATIVE) 1823 r.n = _mm_loadu_si128(&((*mem_addr).n));
1824 #elif defined(SIMDE_SSE2_NEON) 1825 r.neon_i32 = vld1q_s32((int32_t
const *)mem_addr);
1827 memcpy(&r, mem_addr,
sizeof(r));
1838 #if defined(SIMDE_SSE2_NATIVE) 1839 r.n = _mm_madd_epi16(a.n, b.n);
1840 #elif defined(SIMDE_SSE2_NEON) 1842 vmull_s16(vget_low_s16(a.neon_i16), vget_low_s16(b.neon_i16));
1844 vmull_s16(vget_high_s16(a.neon_i16), vget_high_s16(b.neon_i16));
1845 int32x2_t rl = vpadd_s32(vget_low_s32(pl), vget_high_s32(pl));
1846 int32x2_t rh = vpadd_s32(vget_low_s32(ph), vget_high_s32(ph));
1847 r.neon_i32 = vcombine_s32(rl, rh);
1850 for (
size_t i = 0; i < (
sizeof(r) /
sizeof(r.i16[0])); i += 2) {
1852 (a.i16[i] * b.i16[i]) + (a.i16[i + 1] * b.i16[i + 1]);
1863 #if defined(SIMDE_SSE2_NATIVE) 1864 _mm_maskmoveu_si128(a.n, mask.n, (
char *)mem_addr);
1866 for (
size_t i = 0; i < 16; i++) {
1867 if (mask.u8[i] & 0x80) {
1868 mem_addr[i] = a.i8[i];
1877 #if defined(SIMDE_SSE2_NATIVE) 1878 return _mm_movemask_epi8(a.n);
1879 #elif defined(SIMDE_SSE2_NEON) 1880 uint8x16_t input = a.neon_u8;
1882 static const int8_t xr[8] = {-7, -6, -5, -4, -3, -2, -1, 0};
1883 uint8x8_t mask_and = vdup_n_u8(0x80);
1884 int8x8_t mask_shift = vld1_s8(xr);
1886 uint8x8_t lo = vget_low_u8(input);
1887 uint8x8_t hi = vget_high_u8(input);
1889 lo = vand_u8(lo, mask_and);
1890 lo = vshl_u8(lo, mask_shift);
1892 hi = vand_u8(hi, mask_and);
1893 hi = vshl_u8(hi, mask_shift);
1895 lo = vpadd_u8(lo, lo);
1896 lo = vpadd_u8(lo, lo);
1897 lo = vpadd_u8(lo, lo);
1899 hi = vpadd_u8(hi, hi);
1900 hi = vpadd_u8(hi, hi);
1901 hi = vpadd_u8(hi, hi);
1903 return ((hi[0] << 8) | (lo[0] & 0xFF));
1907 for (
size_t i = 0; i < 16; i++) {
1908 r |= (a.u8[15 - i] >> 7) << (15 - i);
1917 #if defined(SIMDE_SSE2_NATIVE) 1918 return _mm_movemask_pd(a.n);
1922 for (
size_t i = 0; i < (
sizeof(a.u64) /
sizeof(a.u64[0])); i++) {
1923 r |= (a.u64[i] >> 63) << i;
1934 #if defined(SIMDE_SSE2_NATIVE) 1935 r.n = _mm_movepi64_pi64(a.n);
1937 r.
i64[0] = a.i64[0];
1948 #if defined(SIMDE_SSE2_NATIVE) 1949 r.n = _mm_movpi64_epi64(a.n);
1951 r.i64[0] = a.
i64[0];
1963 #if defined(SIMDE_SSE2_NATIVE) 1964 r.n = _mm_min_epi16(a.n, b.n);
1965 #elif defined(SIMDE_SSE2_NEON) 1966 r.neon_i16 = vminq_s16(a.neon_i16, b.neon_i16);
1969 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
1970 r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i];
1982 #if defined(SIMDE_SSE2_NATIVE) 1983 r.n = _mm_min_epu8(a.n, b.n);
1984 #elif defined(SIMDE_SSE2_NEON) 1985 r.neon_u8 = vminq_u8(a.neon_u8, b.neon_u8);
1988 for (
size_t i = 0; i < (
sizeof(r.u8) /
sizeof(r.u8[0])); i++) {
1989 r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i];
2001 #if defined(SIMDE_SSE2_NATIVE) 2002 r.n = _mm_min_pd(a.n, b.n);
2005 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
2006 r.f64[i] = (a.f64[i] < b.f64[i]) ? a.f64[i] : b.f64[i];
2018 #if defined(SIMDE_SSE2_NATIVE) 2019 r.n = _mm_min_sd(a.n, b.n);
2021 r.f64[0] = (a.f64[0] < b.f64[0]) ? a.f64[0] : b.f64[0];
2022 r.f64[1] = a.f64[1];
2033 #if defined(SIMDE_SSE2_NATIVE) 2034 r.n = _mm_max_epi16(a.n, b.n);
2035 #elif defined(SIMDE_SSE2_NEON) 2036 r.neon_i16 = vmaxq_s16(a.neon_i16, b.neon_i16);
2039 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
2040 r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i];
2052 #if defined(SIMDE_SSE2_NATIVE) 2053 r.n = _mm_max_epu8(a.n, b.n);
2054 #elif defined(SIMDE_SSE2_NEON) 2055 r.neon_u8 = vmaxq_u8(a.neon_u8, b.neon_u8);
2058 for (
size_t i = 0; i < (
sizeof(r.u8) /
sizeof(r.u8[0])); i++) {
2059 r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i];
2071 #if defined(SIMDE_SSE2_NATIVE) 2072 r.n = _mm_max_pd(a.n, b.n);
2075 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
2076 r.f64[i] = (a.f64[i] > b.f64[i]) ? a.f64[i] : b.f64[i];
2088 #if defined(SIMDE_SSE2_NATIVE) 2089 r.n = _mm_max_sd(a.n, b.n);
2091 r.f64[0] = (a.f64[0] > b.f64[0]) ? a.f64[0] : b.f64[0];
2092 r.f64[1] = a.f64[1];
2103 #if defined(SIMDE_SSE2_NATIVE) 2104 r.n = _mm_move_epi64(a.n);
2105 #elif defined(SIMDE_SSE2_NEON) 2106 r.neon_i64 = vsetq_lane_s64(0, a.neon_i64, 1);
2108 r.i64[0] = a.i64[0];
2120 #if defined(SIMDE_SSE2_NATIVE) 2121 r.n = _mm_move_sd(a.n, b.n);
2123 r.f64[0] = b.f64[0];
2124 r.f64[1] = a.f64[1];
2135 #if defined(SIMDE_SSE2_NATIVE) 2136 r.n = _mm_mul_epu32(a.n, b.n);
2139 for (
size_t i = 0; i < (
sizeof(r.u64) /
sizeof(r.u64[0])); i++) {
2140 r.u64[i] = ((uint64_t)a.u32[i * 2]) * ((uint64_t)b.u32[i * 2]);
2153 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
2154 r.i64[i] = a.i64[i] * b.i64[i];
2166 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
2167 r.i64[i] = a.i64[i] % b.i64[i];
2178 #if defined(SIMDE_SSE2_NATIVE) 2179 r.n = _mm_mul_pd(a.n, b.n);
2182 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
2183 r.f64[i] = a.f64[i] * b.f64[i];
2195 #if defined(SIMDE_SSE2_NATIVE) 2196 r.n = _mm_mul_sd(a.n, b.n);
2198 r.f64[0] = a.f64[0] * b.f64[0];
2199 r.f64[1] = a.f64[1];
2210 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) 2211 r.n = _mm_mul_su32(a.n, b.n);
2213 r.
u64[0] = ((uint64_t)a.
u32[0]) * ((uint64_t)b.
u32[0]);
2224 #if defined(SIMDE_SSE2_NATIVE) 2225 r.n = _mm_mulhi_epi16(a.n, b.n);
2226 #elif defined(SIMDE_SSE2_NEON) 2227 int16x4_t a3210 = vget_low_s16(a.neon_i16);
2228 int16x4_t b3210 = vget_low_s16(b.neon_i16);
2229 int32x4_t ab3210 = vmull_s16(a3210, b3210);
2230 int16x4_t a7654 = vget_high_s16(a.neon_i16);
2231 int16x4_t b7654 = vget_high_s16(b.neon_i16);
2232 int32x4_t ab7654 = vmull_s16(a7654, b7654);
2233 uint16x8x2_t rv = vuzpq_u16(vreinterpretq_u16_s32(ab3210),
2234 vreinterpretq_u16_s32(ab7654));
2235 r.neon_u16 = rv.val[1];
2238 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
2239 r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
2240 ((int32_t)b.i16[i]))) >>
2253 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) 2254 r.n = _mm_mulhi_epu16(a.n, b.n);
2257 for (
size_t i = 0; i < (
sizeof(r.u16) /
sizeof(r.u16[0])); i++) {
2258 r.u16[i] = (uint16_t)(
2259 (((uint32_t)a.u16[i]) * ((uint32_t)b.u16[i])) >> 16);
2271 #if defined(SIMDE_SSE2_NATIVE) 2272 r.n = _mm_mullo_epi16(a.n, b.n);
2273 #elif defined(SIMDE_SSE2_NEON) 2274 r.neon_i16 = vmulq_s16(a.neon_i16, b.neon_i16);
2277 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
2278 r.u16[i] = (uint16_t)(((uint32_t)(((int32_t)a.i16[i]) *
2279 ((int32_t)b.i16[i]))) &
2292 #if defined(SIMDE_SSE2_NATIVE) 2293 r.n = _mm_or_pd(a.n, b.n);
2296 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
2297 r.i64[i] = a.i64[i] | b.i64[i];
2309 #if defined(SIMDE_SSE2_NATIVE) 2310 r.n = _mm_or_si128(a.n, b.n);
2311 #elif defined(SIMDE_SSE2_NEON) 2312 r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
2315 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
2316 r.i64[i] = a.i64[i] | b.i64[i];
2328 #if defined(SIMDE_SSE2_NATIVE) 2329 r.n = _mm_packs_epi16(a.n, b.n);
2330 #elif defined(SIMDE_SSE2_NEON) 2331 r.neon_i8 = vcombine_s8(vqmovn_s16(a.neon_i16), vqmovn_s16(b.neon_i16));
2334 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
2335 r.i8[i] = (a.i16[i] > INT8_MAX)
2337 : ((a.i16[i] < INT8_MIN)
2339 : ((int8_t)a.i16[i]));
2340 r.i8[i + 8] = (b.i16[i] > INT8_MAX)
2342 : ((b.i16[i] < INT8_MIN)
2344 : ((int8_t)b.i16[i]));
2356 #if defined(SIMDE_SSE2_NATIVE) 2357 r.n = _mm_packs_epi32(a.n, b.n);
2358 #elif defined(SIMDE_SSE2_NEON) 2360 vcombine_s16(vqmovn_s32(a.neon_i32), vqmovn_s32(b.neon_i32));
2363 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
2364 r.i16[i] = (a.i32[i] > INT16_MAX)
2366 : ((a.i32[i] < INT16_MIN)
2368 : ((int16_t)a.i32[i]));
2369 r.i16[i + 4] = (b.i32[i] > INT16_MAX)
2371 : ((b.i32[i] < INT16_MIN)
2373 : ((int16_t)b.i32[i]));
2385 #if defined(SIMDE_SSE2_NATIVE) 2386 r.n = _mm_packus_epi16(a.n, b.n);
2387 #elif defined(SIMDE_SSE2_NEON) 2389 vcombine_u8(vqmovun_s16(a.neon_i16), vqmovun_s16(b.neon_i16));
2392 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
2393 r.u8[i] = (a.i16[i] > UINT8_MAX)
2395 : ((a.i16[i] < 0) ? 0 : ((int8_t)a.i16[i]));
2397 (b.i16[i] > UINT8_MAX)
2399 : ((b.i16[i] < 0) ? 0 : ((int8_t)b.i16[i]));
2409 #if defined(SIMDE_SSE2_NATIVE) 2419 #if defined(SIMDE_SSE2_NATIVE) 2420 r.n = _mm_sad_epu8(a.n, b.n);
2422 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
2425 for (
size_t j = 0; j < ((
sizeof(r.u8) /
sizeof(r.u8[0])) / 2);
2427 const size_t e = j + (i * 8);
2428 tmp += (a.u8[e] > b.u8[e]) ? (a.u8[e] - b.u8[e])
2429 : (b.u8[e] - a.u8[e]);
2440 int8_t e11, int8_t e10, int8_t e9, int8_t e8,
2441 int8_t e7, int8_t e6, int8_t e5, int8_t e4,
2442 int8_t e3, int8_t e2, int8_t e1, int8_t e0)
2446 #if defined(SIMDE_SSE2_NATIVE) 2447 r.n = _mm_set_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5, e4,
2473 int16_t e3, int16_t e2, int16_t e1, int16_t e0)
2477 #if defined(SIMDE_SSE2_NATIVE) 2478 r.n = _mm_set_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
2479 #elif defined(SIMDE_SSE2_NEON) 2480 SIMDE_ALIGN(16) int16_t data[8] = {e0, e1, e2, e3, e4, e5, e6, e7};
2481 r.neon_i16 = vld1q_s16(data);
2501 #if defined(SIMDE_SSE2_NATIVE) 2502 r.n = _mm_set_epi32(e3, e2, e1, e0);
2503 #elif defined(SIMDE_SSE2_NEON) 2504 SIMDE_ALIGN(16) int32_t data[4] = {e0, e1, e2, e3};
2505 r.neon_i32 = vld1q_s32(data);
2521 #if defined(SIMDE_SSE2_NATIVE) 2522 r.n = _mm_set_epi64(e1.n, e0.n);
2524 r.i64[0] = e0.
i64[0];
2525 r.i64[1] = e1.
i64[0];
2536 #if defined(SIMDE_SSE2_NATIVE) 2537 r.n = _mm_set_epi64x(e1, e0);
2538 #elif defined(SIMDE_SSE2_NEON) 2539 r = SIMDE__M128I_NEON_C(i64,
2540 vcombine_s64(vdup_n_s64(e0), vdup_n_s64(e1)));
2551 uint8_t e12, uint8_t e11, uint8_t e10,
2552 uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6,
2553 uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2,
2554 uint8_t e1, uint8_t e0)
2580 uint16_t e4, uint16_t e3, uint16_t e2,
2581 uint16_t e1, uint16_t e0)
2627 #if defined(SIMDE_SSE2_NATIVE) 2628 r.n = _mm_set_pd(e1, e0);
2642 #if defined(SIMDE_SSE2_NATIVE) 2643 r.n = _mm_set1_pd(a);
2657 #if defined(SIMDE_SSE2_NATIVE) 2658 r.n = _mm_set_sd(a);
2672 #if defined(SIMDE_SSE2_NATIVE) 2673 r.n = _mm_set1_epi8(a);
2674 #elif defined(SIMDE_SSE2_NEON) 2675 r.neon_i8 = vdupq_n_s8(a);
2678 for (
size_t i = 0; i < (
sizeof(r.i8) /
sizeof(r.i8[0])); i++) {
2691 #if defined(SIMDE_SSE2_NATIVE) 2692 r.n = _mm_set1_epi16(a);
2693 #elif defined(SIMDE_SSE2_NEON) 2694 r.neon_i16 = vdupq_n_s16(a);
2697 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
2710 #if defined(SIMDE_SSE2_NATIVE) 2711 r.n = _mm_set1_epi32(a);
2712 #elif defined(SIMDE_SSE2_NEON) 2713 r.neon_i32 = vdupq_n_s32(a);
2716 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
2729 #if defined(SIMDE_SSE2_NATIVE) 2730 r.n = _mm_set1_epi64x(a);
2731 #elif defined(SIMDE_SSE2_NEON) 2732 r.neon_i64 = vmovq_n_s64(a);
2735 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
2748 #if defined(SIMDE_SSE2_NATIVE) 2749 r.n = _mm_set1_epi64(a.n);
2752 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
2753 r.i64[i] = a.
i64[0];
2765 #if defined(SIMDE_SSE2_NATIVE) 2766 r.n = _mm_set1_pd(a);
2769 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
2779 int8_t e11, int8_t e10, int8_t e9, int8_t e8,
2780 int8_t e7, int8_t e6, int8_t e5, int8_t e4,
2781 int8_t e3, int8_t e2, int8_t e1, int8_t e0)
2785 #if defined(SIMDE_SSE2_NATIVE) 2786 r.n = _mm_setr_epi8(e15, e14, e13, e12, e11, e10, e9, e8, e7, e6, e5,
2787 e4, e3, e2, e1, e0);
2788 #elif defined(SIMDE_SSE2_NEON) 2789 int8_t t[] = {e15, e14, e13, e12, e11, e10, e9, e8,
2790 e7, e6, e5, e4, e3, e2, e1, e0};
2791 r.neon_i8 = vld1q_s8(t);
2816 int16_t e3, int16_t e2, int16_t e1, int16_t e0)
2820 #if defined(SIMDE_SSE2_NATIVE) 2821 r.n = _mm_setr_epi16(e7, e6, e5, e4, e3, e2, e1, e0);
2822 #elif defined(SIMDE_SSE2_NEON) 2823 int16_t t[] = {e7, e6, e5, e4, e3, e2, e1, e0};
2824 r.neon_i16 = vld1q_s16(t);
2844 #if defined(SIMDE_SSE2_NATIVE) 2845 r.n = _mm_setr_epi32(e3, e2, e1, e0);
2846 #elif defined(SIMDE_SSE2_NEON) 2847 int32_t t[] = {e3, e2, e1, e0};
2848 r.neon_i32 = vld1q_s32(t);
2864 #if defined(SIMDE_SSE2_NATIVE) 2865 r.n = _mm_setr_epi64(e1.n, e0.n);
2866 #elif defined(SIMDE_SSE2_NEON) 2867 r.neon_i64 = vcombine_s64(e1.neon_i64, e0.neon_i64);
2869 r.i64[0] = e1.
i64[0];
2870 r.i64[1] = e0.
i64[0];
2881 #if defined(SIMDE_SSE2_NATIVE) 2882 r.n = _mm_setr_pd(e1, e0);
2896 #if defined(SIMDE_SSE2_NATIVE) 2897 r.n = _mm_setzero_pd();
2911 #if defined(SIMDE_SSE2_NATIVE) 2912 r.n = _mm_setzero_si128();
2913 #elif defined(SIMDE_SSE2_NEON) 2914 r.neon_i32 = vdupq_n_s32(0);
2928 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
2929 r.i32[i] = a.i32[(imm8 >> (i * 2)) & 3];
2934 #if defined(SIMDE_SSE2_NATIVE) 2935 #define simde_mm_shuffle_epi32(a, imm8) \ 2936 SIMDE__M128I_C(_mm_shuffle_epi32((a).n, (imm8))) 2937 #elif defined(SIMDE__SHUFFLE_VECTOR) 2938 #define simde_mm_shuffle_epi32(a, imm8) \ 2940 const simde__m128i simde__tmp_a_ = a; \ 2941 (simde__m128i){.i32 = SIMDE__SHUFFLE_VECTOR( \ 2942 32, 16, (simde__tmp_a_).i32, \ 2943 (simde__tmp_a_).i32, ((imm8)) & 3, \ 2944 ((imm8) >> 2) & 3, ((imm8) >> 4) & 3, \ 2945 ((imm8) >> 6) & 3)}; \ 2954 r.f64[0] = ((imm8 & 1) == 0) ? a.f64[0] : a.f64[1];
2955 r.f64[1] = ((imm8 & 2) == 0) ? b.f64[0] : b.f64[1];
2959 #if defined(SIMDE_SSE2_NATIVE) && !defined(__PGI) 2960 #define simde_mm_shuffle_pd(a, b, imm8) \ 2961 SIMDE__M128D_C(_mm_shuffle_pd((a).n, (b).n, (imm8))) 2962 #elif defined(SIMDE__SHUFFLE_VECTOR) 2963 #define simde_mm_shuffle_pd(a, b, imm8) \ 2965 (simde__m128d){.f64 = SIMDE__SHUFFLE_VECTOR( \ 2966 64, 16, (a).f64, (b).f64, \ 2968 (((imm8) >> 1) & 1) + 2)}; \ 2977 r.i64[0] = a.i64[0];
2978 for (
size_t i = 4; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
2979 r.i16[i] = a.i16[((imm8 >> ((i - 4) * 2)) & 3) + 4];
2984 #if defined(SIMDE_SSE2_NATIVE) 2985 #define simde_mm_shufflehi_epi16(a, imm8) \ 2986 SIMDE__M128I_C(_mm_shufflehi_epi16((a).n, (imm8))) 2987 #elif defined(SIMDE__SHUFFLE_VECTOR) 2988 #define simde_mm_shufflehi_epi16(a, imm8) \ 2990 const simde__m128i simde__tmp_a_ = a; \ 2991 (simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \ 2992 16, 16, (simde__tmp_a_).i16, \ 2993 (simde__tmp_a_).i16, 0, 1, 2, 3, \ 2994 (((imm8)) & 3) + 4, \ 2995 (((imm8) >> 2) & 3) + 4, \ 2996 (((imm8) >> 4) & 3) + 4, \ 2997 (((imm8) >> 6) & 3) + 4)}; \ 3006 for (
size_t i = 0; i < ((
sizeof(r.i16) /
sizeof(r.i16[0])) / 2); i++) {
3007 r.i16[i] = a.i16[((imm8 >> (i * 2)) & 3)];
3009 r.i64[1] = a.i64[1];
3013 #if defined(SIMDE_SSE2_NATIVE) 3014 #define simde_mm_shufflelo_epi16(a, imm8) \ 3015 SIMDE__M128I_C(_mm_shufflelo_epi16((a).n, (imm8))) 3016 #elif defined(SIMDE__SHUFFLE_VECTOR) 3017 #define simde_mm_shufflelo_epi16(a, imm8) \ 3019 const simde__m128i simde__tmp_a_ = a; \ 3020 (simde__m128i){.i16 = SIMDE__SHUFFLE_VECTOR( \ 3021 16, 16, (simde__tmp_a_).i16, \ 3022 (simde__tmp_a_).i16, (((imm8)) & 3), \ 3023 (((imm8) >> 2) & 3), \ 3024 (((imm8) >> 4) & 3), \ 3025 (((imm8) >> 6) & 3), 4, 5, 6, 7)}; \ 3032 #if defined(SIMDE_SSE2_NATIVE) 3033 return SIMDE__M128I_C(_mm_sll_epi16(a.n, count.n));
3037 if (count.u64[0] > 15)
3039 const int s = (int)(count.u64[0]);
3042 for (
size_t i = 0; i < (
sizeof(r.u16) /
sizeof(r.u16[0])); i++) {
3043 r.u16[i] = a.u16[i] << s;
3052 #if defined(SIMDE_SSE2_NATIVE) 3053 return SIMDE__M128I_C(_mm_sll_epi32(a.n, count.n));
3057 if (count.u64[0] > 31)
3059 const int s = (int)(count.u64[0]);
3062 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
3063 r.i32[i] = a.i32[i] << s;
3072 #if defined(SIMDE_SSE2_NATIVE) 3073 return SIMDE__M128I_C(_mm_sll_epi64(a.n, count.n));
3077 if (count.u64[0] > 63)
3079 const int s = (int)(count.u64[0]);
3082 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
3083 r.i64[i] = a.i64[i] << s;
3092 #if defined(SIMDE_SSE2_NATIVE) 3093 return SIMDE__M128D_C(_mm_sqrt_pd(a.n));
3098 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
3099 r.f64[i] = sqrt(a.f64[i]);
3109 #if defined(SIMDE_SSE2_NATIVE) 3110 return SIMDE__M128D_C(_mm_sqrt_sd(a.n, b.n));
3113 r.f64[0] = sqrt(b.f64[0]);
3114 r.f64[1] = a.f64[1];
3122 #if defined(SIMDE_SSE2_NATIVE) 3123 return SIMDE__M128I_C(_mm_srl_epi16(a.n, count.n));
3127 if (count.u64[0] > 15)
3129 const int s = (int)(count.u64[0]);
3132 for (
size_t i = 0; i < (
sizeof(r.u16) /
sizeof(r.u16[0])); i++) {
3133 r.u16[i] = a.u16[i] >> s;
3142 #if defined(SIMDE_SSE2_NATIVE) 3143 return SIMDE__M128I_C(_mm_srl_epi32(a.n, count.n));
3147 if (count.u64[0] > 31)
3149 const int s = (int)(count.u64[0]);
3152 for (
size_t i = 0; i < (
sizeof(r.u32) /
sizeof(r.u32[0])); i++) {
3153 r.u32[i] = a.u32[i] >> s;
3162 #if defined(SIMDE_SSE2_NATIVE) 3163 return SIMDE__M128I_C(_mm_srl_epi64(a.n, count.n));
3167 if (count.u64[0] > 31)
3169 const int s = (int)(count.u64[0]);
3172 for (
size_t i = 0; i < (
sizeof(r.u64) /
sizeof(r.u64[0])); i++) {
3173 r.u64[i] = a.u64[i] >> s;
3185 (uint16_t)((~0U) << ((
sizeof(int16_t) * CHAR_BIT) - imm8));
3188 for (
size_t i = 0; i < (
sizeof(r) /
sizeof(r.u16[0])); i++) {
3189 const uint16_t is_neg = ((uint16_t)(
3190 ((a.u16[i]) >> ((
sizeof(int16_t) * CHAR_BIT) - 1))));
3191 r.u16[i] = (a.u16[i] >> imm8) | (m * is_neg);
3196 #if defined(SIMDE_SSE2_NATIVE) 3197 #define simde_mm_srai_epi16(a, imm8) \ 3198 SIMDE__M128I_C(_mm_srai_epi16((a).n, (imm8))); 3207 (uint32_t)((~0U) << ((
sizeof(int) * CHAR_BIT) - imm8));
3209 for (
size_t i = 0; i < (
sizeof(r) /
sizeof(r.u32[0])); i++) {
3210 uint32_t is_neg = ((uint32_t)(
3211 ((a.u32[i]) >> ((
sizeof(int32_t) * CHAR_BIT) - 1))));
3212 r.u32[i] = (a.u32[i] >> imm8) | (m * is_neg);
3217 #if defined(SIMDE_SSE2_NATIVE) 3218 #define simde_mm_srai_epi32(a, imm8) \ 3219 SIMDE__M128I_C(_mm_srai_epi32((a).n, (imm8))) 3220 #elif defined(SIMDE_SSE2_NEON) 3221 #define simde_mm_srai_epi32(a, imm8) \ 3222 SIMDE__M128I_NEON_C( \ 3227 ? (vshrq_n_s32(vshrq_n_s32(a.neon_i32, 16), \ 3229 : (vshrq_n_s32(a.neon_i32, (imm8))))) 3235 #if defined(SIMDE_SSE2_NATIVE) 3236 return SIMDE__M128I_C(_mm_sra_epi16(a.n, count.n));
3239 int cnt = (int)count.i64[0];
3241 if (cnt > 15 || cnt < 0) {
3242 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0]));
3244 r.u16[i] = (a.i16[i] < 0) ? 0xffff : 0x0000;
3247 const uint16_t m = (uint16_t)(
3248 (~0U) << ((
sizeof(int16_t) * CHAR_BIT) - cnt));
3249 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0]));
3251 const uint16_t is_neg = a.i16[i] < 0;
3252 r.u16[i] = (a.u16[i] >> cnt) | (m * is_neg);
3263 #if defined(SIMDE_SSE2_NATIVE) && !defined(SIMDE_BUG_GCC_BAD_MM_SRA_EPI32) 3264 return SIMDE__M128I_C(_mm_sra_epi32(a.n, count.n));
3267 const uint64_t cnt = count.u64[0];
3270 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0]));
3272 r.u32[i] = (a.i32[i] < 0) ? UINT32_MAX : 0;
3274 }
else if (cnt == 0) {
3275 memcpy(&r, &a,
sizeof(r));
3277 const uint32_t m = (uint32_t)(
3278 (~0U) << ((
sizeof(int32_t) * CHAR_BIT) - cnt));
3279 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0]));
3281 const uint32_t is_neg = a.i32[i] < 0;
3282 r.u32[i] = (a.u32[i] >> cnt) | (m * is_neg);
3294 const int s = (imm8 > ((int)
sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0
3297 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
3298 r.i16[i] = a.i16[i] << s;
3302 #if defined(SIMDE_SSE2_NATIVE) 3303 #define simde_mm_slli_epi16(a, imm8) SIMDE__M128I_C(_mm_slli_epi16(a.n, imm8)); 3304 #elif defined(SIMDE_SSE2_NEON) 3305 #define simde_mm_slli_epi16(a, imm8) \ 3306 SIMDE__M128I_NEON_C( \ 3307 i16, ((imm8) <= 0) \ 3309 : (((imm8) > 31) ? (vdupq_n_s16(0)) \ 3310 : (vshlq_n_s16((a).neon_i16, \ 3318 const int s = (imm8 > ((int)
sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0
3321 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
3322 r.i32[i] = a.i32[i] << s;
3326 #if defined(SIMDE_SSE2_NATIVE) 3327 #define simde_mm_slli_epi32(a, imm8) SIMDE__M128I_C(_mm_slli_epi32(a.n, imm8)); 3328 #elif defined(SIMDE_SSE2_NEON) 3329 #define simde_mm_slli_epi32(a, imm8) \ 3330 SIMDE__M128I_NEON_C( \ 3331 i32, ((imm8) <= 0) \ 3333 : (((imm8) > 31) ? (vdupq_n_s32(0)) \ 3334 : (vshlq_n_s32((a).neon_i32, \ 3342 const int s = (imm8 > ((int)
sizeof(r.i64[0]) * CHAR_BIT) - 1) ? 0
3345 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
3346 r.i64[i] = a.i64[i] << s;
3350 #if defined(SIMDE_SSE2_NATIVE) 3351 #define simde_mm_slli_epi64(a, imm8) SIMDE__M128I_C(_mm_slli_epi64(a.n, imm8)); 3358 const int s = (imm8 > ((int)
sizeof(r.i16[0]) * CHAR_BIT) - 1) ? 0
3361 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
3362 r.u16[i] = a.u16[i] >> s;
3366 #if defined(SIMDE_SSE2_NATIVE) 3367 #define simde_mm_srli_epi16(a, imm8) SIMDE__M128I_C(_mm_srli_epi16(a.n, imm8)); 3368 #elif defined(SIMDE_SSE2_NEON) 3369 #define simde_mm_srli_epi16(a, imm8) \ 3370 SIMDE__M128I_NEON_C( \ 3371 u16, ((imm8) <= 0) \ 3373 : (((imm8) > 31) ? (vdupq_n_u16(0)) \ 3374 : (vshrq_n_u16((a).neon_u16, \ 3382 const int s = (imm8 > ((int)
sizeof(r.i32[0]) * CHAR_BIT) - 1) ? 0
3385 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
3386 r.u32[i] = a.u32[i] >> s;
3390 #if defined(SIMDE_SSE2_NATIVE) 3391 #define simde_mm_srli_epi32(a, imm8) SIMDE__M128I_C(_mm_srli_epi32(a.n, imm8)) 3392 #elif defined(SIMDE_SSE2_NEON) 3393 #define simde_mm_srli_epi32(a, imm8) \ 3394 SIMDE__M128I_NEON_C( \ 3395 u32, ((imm8) <= 0) \ 3397 : (((imm8) > 31) ? (vdupq_n_u32(0)) \ 3398 : (vshrq_n_u32((a).neon_u32, \ 3406 const unsigned char s = imm8 & 255;
3408 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
3412 r.u64[i] = a.u64[i] >> s;
3417 #if defined(SIMDE_SSE2_NATIVE) 3418 #define simde_mm_srli_epi64(a, imm8) SIMDE__M128I_C(_mm_srli_epi64(a.n, imm8)) 3419 #elif defined(SIMDE_SSE2_NEON) 3420 #define simde_mm_srli_epi64(a, imm8) \ 3421 SIMDE__M128I_NEON_C( \ 3423 (((imm8)&255) < 0 || ((imm8)&255) > 63) \ 3424 ? (vdupq_n_u64(0)) \ 3425 : ((((imm8)&255) == 0) \ 3427 : (vshrq_n_u64((a).neon_u64, (imm8)&255)))) 3436 #if defined(SIMDE_SSE2_NATIVE) 3437 _mm_store_pd(mem_addr, a.n);
3440 memcpy(mem_addr, &a,
sizeof(a));
3450 #if defined(SIMDE_SSE2_NATIVE) 3451 _mm_store1_pd(mem_addr, a.n);
3454 mem_addr[0] = a.f64[0];
3455 mem_addr[1] = a.f64[0];
3458 #define simde_mm_store_pd1(mem_addr, a) simde_mm_store1_pd(mem_addr, a) 3463 #if defined(SIMDE_SSE2_NATIVE) 3464 _mm_store_sd(mem_addr, a.n);
3466 memcpy(mem_addr, &a,
sizeof(a.f64[0]));
3473 #if defined(SIMDE_SSE2_NATIVE) 3474 _mm_store_si128(&mem_addr->n, a.n);
3475 #elif defined(SIMDE_SSE2_NEON) 3476 vst1q_s32((int32_t *)mem_addr, a.neon_i32);
3479 memcpy(mem_addr, &a,
sizeof(a));
3486 #if defined(SIMDE_SSE2_NATIVE) 3487 _mm_storeh_pd(mem_addr, a.n);
3489 *mem_addr = a.f64[1];
3496 #if defined(SIMDE_SSE2_NATIVE) 3497 _mm_storel_epi64(&(mem_addr->n), a.n);
3498 #elif defined(SIMDE_SSE2_NEON) 3499 mem_addr->i64[0] = vgetq_lane_s64(a.neon_i64, 0);
3501 mem_addr->i64[0] = a.i64[0];
3508 #if defined(SIMDE_SSE2_NATIVE) 3509 _mm_storel_pd(mem_addr, a.n);
3511 *mem_addr = a.f64[0];
3520 #if defined(SIMDE_SSE2_NATIVE) 3521 _mm_storer_pd(mem_addr, a.n);
3524 mem_addr[0] = a.f64[1];
3525 mem_addr[1] = a.f64[0];
3532 #if defined(SIMDE_SSE2_NATIVE) 3533 _mm_storeu_pd(mem_addr, a.n);
3535 memcpy(mem_addr, &a,
sizeof(a));
3542 #if defined(SIMDE_SSE2_NATIVE) 3543 _mm_storeu_si128(&mem_addr->n, a.n);
3544 #elif defined(SIMDE_SSE2_NEON) 3546 vst1q_s32(v, a.neon_i32);
3547 memcpy(mem_addr, v,
sizeof(v));
3549 memcpy(mem_addr, &a,
sizeof(a));
3557 #if defined(SIMDE_SSE2_NATIVE) 3558 _mm_stream_pd(mem_addr, a.n);
3561 memcpy(mem_addr, &a,
sizeof(a));
3568 #if defined(SIMDE_SSE2_NATIVE) 3569 _mm_stream_si128(&mem_addr->n, a.n);
3572 memcpy(mem_addr, &a,
sizeof(a));
3579 #if defined(SIMDE_SSE2_NATIVE) 3580 _mm_stream_si32(mem_addr, a);
3589 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE_ARCH_AMD64) 3590 #if defined(SIMDE__REALLY_GCC) && !HEDLEY_GCC_VERSION_CHECK(4, 8, 0) 3592 #elif defined(__GNUC__) 3593 _mm_stream_si64((
long long *)mem_addr, a);
3595 _mm_stream_si64(mem_addr, a);
3605 #if defined(SIMDE_SSE2_NATIVE) 3606 return SIMDE__M128I_C(_mm_sub_epi8(a.n, b.n));
3607 #elif defined(SIMDE_SSE2_NEON) 3608 return SIMDE__M128I_NEON_C(i8, vsubq_s8(a.neon_i8, b.neon_i8));
3612 for (
size_t i = 0; i < (
sizeof(r.i8) /
sizeof(r.i8[0])); i++) {
3613 r.i8[i] = a.i8[i] - b.i8[i];
3622 #if defined(SIMDE_SSE2_NATIVE) 3623 return SIMDE__M128I_C(_mm_sub_epi16(a.n, b.n));
3624 #elif defined(SIMDE_SSE2_NEON) 3625 return SIMDE__M128I_NEON_C(i16, vsubq_s16(a.neon_i16, b.neon_i16));
3629 for (
size_t i = 0; i < (
sizeof(r.i16) /
sizeof(r.i16[0])); i++) {
3630 r.i16[i] = a.i16[i] - b.i16[i];
3639 #if defined(SIMDE_SSE2_NATIVE) 3640 return SIMDE__M128I_C(_mm_sub_epi32(a.n, b.n));
3641 #elif defined(SIMDE_SSE2_NEON) 3642 return SIMDE__M128I_NEON_C(i32, vsubq_s32(a.neon_i32, b.neon_i32));
3646 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
3647 r.i32[i] = a.i32[i] - b.i32[i];
3656 #if defined(SIMDE_SSE2_NATIVE) 3657 return SIMDE__M128I_C(_mm_sub_epi64(a.n, b.n));
3658 #elif defined(SIMDE_SSE2_NEON) 3659 return SIMDE__M128I_NEON_C(i64, vsubq_s64(a.neon_i64, b.neon_i64));
3663 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
3664 r.i64[i] = a.i64[i] - b.i64[i];
3673 #if defined(SIMDE_SSE2_NATIVE) 3674 return SIMDE__M128D_C(_mm_sub_pd(a.n, b.n));
3678 for (
size_t i = 0; i < (
sizeof(r.f64) /
sizeof(r.f64[0])); i++) {
3679 r.f64[i] = a.f64[i] - b.f64[i];
3688 #if defined(SIMDE_SSE2_NATIVE) 3689 return SIMDE__M128D_C(_mm_sub_sd(a.n, b.n));
3692 r.f64[0] = a.f64[0] - b.f64[0];
3693 r.f64[1] = a.f64[1];
3701 #if defined(SIMDE_SSE2_NATIVE) 3702 return SIMDE__M64_C(_mm_sub_si64(a.n, b.n));
3713 #if defined(SIMDE_SSE2_NATIVE) 3714 return SIMDE__M128I_C(_mm_subs_epi8(a.n, b.n));
3715 #elif defined(SIMDE_SSE2_NEON) 3716 return SIMDE__M128I_NEON_C(i8, vqsubq_s8(a.neon_i8, b.neon_i8));
3720 for (
size_t i = 0; i < (
sizeof(r) /
sizeof(r.i8[0])); i++) {
3721 if (((b.i8[i]) > 0 && (a.i8[i]) < INT8_MIN + (b.i8[i]))) {
3723 }
else if ((b.i8[i]) < 0 && (a.i8[i]) > INT8_MAX + (b.i8[i])) {
3726 r.i8[i] = (a.i8[i]) - (b.i8[i]);
3736 #if defined(SIMDE_SSE2_NATIVE) 3737 return SIMDE__M128I_C(_mm_subs_epi16(a.n, b.n));
3738 #elif defined(SIMDE_SSE2_NEON) 3739 return SIMDE__M128I_NEON_C(i16, vqsubq_s16(a.neon_i16, b.neon_i16));
3743 for (
size_t i = 0; i < (
sizeof(r) /
sizeof(r.i16[0])); i++) {
3744 if (((b.i16[i]) > 0 && (a.i16[i]) < INT16_MIN + (b.i16[i]))) {
3745 r.i16[i] = INT16_MIN;
3746 }
else if ((b.i16[i]) < 0 &&
3747 (a.i16[i]) > INT16_MAX + (b.i16[i])) {
3748 r.i16[i] = INT16_MAX;
3750 r.i16[i] = (a.i16[i]) - (b.i16[i]);
3760 #if defined(SIMDE_SSE2_NATIVE) 3761 return SIMDE__M128I_C(_mm_subs_epu8(a.n, b.n));
3762 #elif defined(SIMDE_SSE2_NEON) 3763 return SIMDE__M128I_NEON_C(u8, vqsubq_u8(a.neon_u8, b.neon_u8));
3767 for (
size_t i = 0; i < (
sizeof(r) /
sizeof(r.i8[0])); i++) {
3768 const int32_t x = a.u8[i] - b.u8[i];
3771 }
else if (x > UINT8_MAX) {
3772 r.u8[i] = UINT8_MAX;
3774 r.u8[i] = (uint8_t)x;
3784 #if defined(SIMDE_SSE2_NATIVE) 3785 return SIMDE__M128I_C(_mm_subs_epu16(a.n, b.n));
3786 #elif defined(SIMDE_SSE2_NEON) 3787 return SIMDE__M128I_NEON_C(u16, vqsubq_u16(a.neon_u16, b.neon_u16));
3791 for (
size_t i = 0; i < (
sizeof(r) /
sizeof(r.i16[0])); i++) {
3792 const int32_t x = a.u16[i] - b.u16[i];
3795 }
else if (x > UINT16_MAX) {
3796 r.u16[i] = UINT16_MAX;
3798 r.u16[i] = (uint16_t)x;
3808 #if defined(SIMDE_SSE2_NATIVE) 3809 return _mm_ucomieq_sd(a.n, b.n);
3812 int x = feholdexcept(&envp);
3813 int r = a.f64[0] == b.f64[0];
3823 #if defined(SIMDE_SSE2_NATIVE) 3824 return _mm_ucomige_sd(a.n, b.n);
3827 int x = feholdexcept(&envp);
3828 int r = a.f64[0] >= b.f64[0];
3838 #if defined(SIMDE_SSE2_NATIVE) 3839 return _mm_ucomigt_sd(a.n, b.n);
3842 int x = feholdexcept(&envp);
3843 int r = a.f64[0] > b.f64[0];
3853 #if defined(SIMDE_SSE2_NATIVE) 3854 return _mm_ucomile_sd(a.n, b.n);
3857 int x = feholdexcept(&envp);
3858 int r = a.f64[0] <= b.f64[0];
3868 #if defined(SIMDE_SSE2_NATIVE) 3869 return _mm_ucomilt_sd(a.n, b.n);
3872 int x = feholdexcept(&envp);
3873 int r = a.f64[0] < b.f64[0];
3883 #if defined(SIMDE_SSE2_NATIVE) 3884 return _mm_ucomineq_sd(a.n, b.n);
3887 int x = feholdexcept(&envp);
3888 int r = a.f64[0] != b.f64[0];
3900 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) 3901 r.n = _mm_undefined_pd();
3914 #if defined(SIMDE_SSE2_NATIVE) && defined(SIMDE__HAVE_UNDEFINED128) 3915 r.n = _mm_undefined_si128();
3926 #if defined(SIMDE_SSE2_NATIVE) 3936 #if defined(SIMDE_SSE2_NATIVE) 3946 #if defined(SIMDE_SSE2_NATIVE) 3947 return SIMDE__M128I_C(_mm_unpackhi_epi8(a.n, b.n));
3948 #elif defined(SIMDE_SSE2_NEON) 3949 int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(a.neon_i16));
3950 int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(b.neon_i16));
3951 int8x8x2_t result = vzip_s8(a1, b1);
3952 return SIMDE__M128I_NEON_C(i8,
3953 vcombine_s8(result.val[0], result.val[1]));
3957 for (
size_t i = 0; i < ((
sizeof(r) /
sizeof(r.i8[0])) / 2); i++) {
3958 r.i8[(i * 2)] = a.i8[i + ((
sizeof(r) /
sizeof(r.i8[0])) / 2)];
3960 b.i8[i + ((
sizeof(r) /
sizeof(r.i8[0])) / 2)];
3969 #if defined(SIMDE_SSE2_NATIVE) 3970 return SIMDE__M128I_C(_mm_unpackhi_epi16(a.n, b.n));
3971 #elif defined(SIMDE_SSE2_NEON) 3972 int16x4_t a1 = vget_high_s16(a.neon_i16);
3973 int16x4_t b1 = vget_high_s16(b.neon_i16);
3974 int16x4x2_t result = vzip_s16(a1, b1);
3975 return SIMDE__M128I_NEON_C(i16,
3976 vcombine_s16(result.val[0], result.val[1]));
3980 for (
size_t i = 0; i < ((
sizeof(r) /
sizeof(r.i16[0])) / 2); i++) {
3982 a.i16[i + ((
sizeof(r) /
sizeof(r.i16[0])) / 2)];
3983 r.i16[(i * 2) + 1] =
3984 b.i16[i + ((
sizeof(r) /
sizeof(r.i16[0])) / 2)];
3993 #if defined(SIMDE_SSE2_NATIVE) 3994 return SIMDE__M128I_C(_mm_unpackhi_epi32(a.n, b.n));
3995 #elif defined(SIMDE_SSE2_NEON) 3996 int32x2_t a1 = vget_high_s32(a.neon_i32);
3997 int32x2_t b1 = vget_high_s32(b.neon_i32);
3998 int32x2x2_t result = vzip_s32(a1, b1);
3999 return SIMDE__M128I_NEON_C(i32,
4000 vcombine_s32(result.val[0], result.val[1]));
4004 for (
size_t i = 0; i < ((
sizeof(r) /
sizeof(r.i32[0])) / 2); i++) {
4006 a.i32[i + ((
sizeof(r) /
sizeof(r.i32[0])) / 2)];
4007 r.i32[(i * 2) + 1] =
4008 b.i32[i + ((
sizeof(r) /
sizeof(r.i32[0])) / 2)];
4017 #if defined(SIMDE_SSE2_NATIVE) 4018 return SIMDE__M128I_C(_mm_unpackhi_epi64(a.n, b.n));
4022 for (
size_t i = 0; i < ((
sizeof(r) /
sizeof(r.i64[0])) / 2); i++) {
4024 a.i64[i + ((
sizeof(r) /
sizeof(r.i64[0])) / 2)];
4025 r.i64[(i * 2) + 1] =
4026 b.i64[i + ((
sizeof(r) /
sizeof(r.i64[0])) / 2)];
4035 #if defined(SIMDE_SSE2_NATIVE) 4036 return SIMDE__M128D_C(_mm_unpackhi_pd(a.n, b.n));
4040 for (
size_t i = 0; i < ((
sizeof(r) /
sizeof(r.f64[0])) / 2); i++) {
4042 a.f64[i + ((
sizeof(r) /
sizeof(r.f64[0])) / 2)];
4043 r.f64[(i * 2) + 1] =
4044 b.f64[i + ((
sizeof(r) /
sizeof(r.f64[0])) / 2)];
4053 #if defined(SIMDE_SSE2_NATIVE) 4054 return SIMDE__M128I_C(_mm_unpacklo_epi8(a.n, b.n));
4055 #elif defined(SIMDE_SSE2_NEON) 4056 int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(a.neon_i16));
4057 int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(b.neon_i16));
4058 int8x8x2_t result = vzip_s8(a1, b1);
4059 return SIMDE__M128I_NEON_C(i8,
4060 vcombine_s8(result.val[0], result.val[1]));
4064 for (
size_t i = 0; i < ((
sizeof(r) /
sizeof(r.i8[0])) / 2); i++) {
4065 r.i8[(i * 2)] = a.i8[i];
4066 r.i8[(i * 2) + 1] = b.i8[i];
4075 #if defined(SIMDE_SSE2_NATIVE) 4076 return SIMDE__M128I_C(_mm_unpacklo_epi16(a.n, b.n));
4077 #elif defined(SIMDE_SSE2_NEON) 4078 int16x4_t a1 = vget_low_s16(a.neon_i16);
4079 int16x4_t b1 = vget_low_s16(b.neon_i16);
4080 int16x4x2_t result = vzip_s16(a1, b1);
4081 return SIMDE__M128I_NEON_C(i16,
4082 vcombine_s16(result.val[0], result.val[1]));
4086 for (
size_t i = 0; i < ((
sizeof(r) /
sizeof(r.i16[0])) / 2); i++) {
4087 r.i16[(i * 2)] = a.i16[i];
4088 r.i16[(i * 2) + 1] = b.i16[i];
4097 #if defined(SIMDE_SSE2_NATIVE) 4098 return SIMDE__M128I_C(_mm_unpacklo_epi32(a.n, b.n));
4099 #elif defined(SIMDE_SSE2_NEON) 4100 int32x2_t a1 = vget_low_s32(a.neon_i32);
4101 int32x2_t b1 = vget_low_s32(b.neon_i32);
4102 int32x2x2_t result = vzip_s32(a1, b1);
4103 return SIMDE__M128I_NEON_C(i32,
4104 vcombine_s32(result.val[0], result.val[1]));
4108 for (
size_t i = 0; i < ((
sizeof(r) /
sizeof(r.i32[0])) / 2); i++) {
4109 r.i32[(i * 2)] = a.i32[i];
4110 r.i32[(i * 2) + 1] = b.i32[i];
4119 #if defined(SIMDE_SSE2_NATIVE) 4120 return SIMDE__M128I_C(_mm_unpacklo_epi64(a.n, b.n));
4124 for (
size_t i = 0; i < ((
sizeof(r) /
sizeof(r.i64[0])) / 2); i++) {
4125 r.i64[(i * 2)] = a.i64[i];
4126 r.i64[(i * 2) + 1] = b.i64[i];
4135 #if defined(SIMDE_SSE2_NATIVE) 4136 return SIMDE__M128D_C(_mm_unpacklo_pd(a.n, b.n));
4140 for (
size_t i = 0; i < ((
sizeof(r) /
sizeof(r.f64[0])) / 2); i++) {
4141 r.f64[(i * 2)] = a.f64[i];
4142 r.f64[(i * 2) + 1] = b.f64[i];
4151 #if defined(SIMDE_SSE2_NATIVE) 4152 return SIMDE__M128D_C(_mm_xor_pd(a.n, b.n));
4156 for (
size_t i = 0; i < (
sizeof(r.i64) /
sizeof(r.i64[0])); i++) {
4157 r.i64[i] = a.i64[i] ^ b.i64[i];
4166 #if defined(SIMDE_SSE2_NATIVE) 4167 return SIMDE__M128I_C(_mm_xor_si128(a.n, b.n));
4168 #elif defined(SIMDE_SSE2_NEON) 4169 return SIMDE__M128I_NEON_C(i32, veorq_s32(a.neon_i32, b.neon_i32));
4173 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
4174 r.i32[i] = a.i32[i] ^ b.i32[i];
4183 #if defined(SIMDE_SSE2_NEON) 4184 return SIMDE__M128I_NEON_C(i32, vmvnq_s32(a.neon_i32));
4188 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
4189 r.i32[i] = ~(a.i32[i]);
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_undefined_pd(void)
Definition: sse2.h:3896
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_set1_pd(simde_float64 a)
Definition: sse2.h:2761
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadr_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
Definition: sse2.h:1781
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpacklo_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:4095
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_mul_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2174
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_si32(int32_t *mem_addr, int32_t a)
Definition: sse2.h:3577
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_pd1(simde_float64 a)
Definition: sse2.h:2638
SIMDE_FLOAT32_TYPE simde_float32
Definition: simde-common.h:150
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtsi32_sd(simde__m128d a, int32_t b)
Definition: sse2.h:1437
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtsi64_si128(int64_t a)
Definition: sse2.h:1491
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu16(uint16_t e7, uint16_t e6, uint16_t e5, uint16_t e4, uint16_t e3, uint16_t e2, uint16_t e1, uint16_t e0)
Definition: sse2.h:2579
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_sub_si64(simde__m64 a, simde__m64 b)
Definition: sse2.h:3699
#define SIMDE__ASSUME_ALIGNED(ptr, align)
Definition: simde-common.h:251
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_si128(simde__m128i *mem_addr, simde__m128i a)
Definition: sse2.h:3566
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srai_epi16(simde__m128i a, int imm8)
Definition: sse2.h:3180
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_movemask_epi8(simde__m128i a)
Definition: sse2.h:1875
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:3603
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_add_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:289
simde__m128
Definition: sse.h:124
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomige_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3821
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_unpacklo_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:4133
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_andnot_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:437
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_add_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:272
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:316
#define HEDLEY_ARRAY_PARAM(name)
Definition: hedley.h:1309
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtsi32_si128(int32_t a)
Definition: sse2.h:1452
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_max_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2084
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_castsi128_pd(simde__m128i a)
Definition: sse2.h:719
SIMDE__BEGIN_DECLS typedef SIMDE_ALIGN(16) union
Definition: sse2.h:83
#define HEDLEY_UNLIKELY(expr)
Definition: hedley.h:1066
#define SIMDE__END_DECLS
Definition: simde-common.h:131
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_not_si128(simde__m128i a)
Definition: sse2.h:4181
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvttsd_si32(simde__m128d a)
Definition: sse2.h:1576
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_sub_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3686
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpgt_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:1016
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sll_epi64(simde__m128i a, simde__m128i count)
Definition: sse2.h:3070
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_min_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2014
#define SIMDE__VECTORIZE_REDUCTION(r)
Definition: simde-common.h:100
SIMDE__FUNCTION_ATTRIBUTES int64_t simde_mm_cvtsi128_si64(simde__m128i a)
Definition: sse2.h:1422
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_div_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1601
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_castps_pd(simde__m128 a)
Definition: sse2.h:687
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sra_epi32(simde__m128i a, simde__m128i count)
Definition: sse2.h:3261
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpgt_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:980
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi64(simde__m64 a)
Definition: sse2.h:2744
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:3654
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comineq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:647
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comilt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:637
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnge_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1090
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:255
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmplt_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:888
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadh_pd(simde__m128d a, simde_float64 const *mem_addr)
Definition: sse2.h:1730
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:238
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadu_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
Definition: sse2.h:1800
int64_t i64[1]
Definition: mmx.h:69
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_madd_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:1834
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_si128(simde__m128i *mem_addr, simde__m128i a)
Definition: sse2.h:3471
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtepi32_pd(simde__m128i a)
Definition: sse2.h:1210
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_sqrt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3107
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_loadl_pd(simde__m128d a, simde_float64 const *mem_addr)
Definition: sse2.h:1765
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnge_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1100
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_castpd_si128(simde__m128d a)
Definition: sse2.h:672
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_or_si128(simde__m128i a, simde__m128i b)
Definition: sse2.h:2305
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storel_pd(simde_float64 *mem_addr, simde__m128d a)
Definition: sse2.h:3506
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store1_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a)
Definition: sse2.h:3445
simde__m128i
Definition: sse2.h:132
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomineq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3881
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnle_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1130
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_clflush(void const *p)
Definition: sse2.h:587
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi16(int16_t a)
Definition: sse2.h:2687
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmplt_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:906
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a)
Definition: sse2.h:3431
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmplt_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:924
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_mod_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:2161
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpackhi_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:3991
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_xor_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:4149
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtpi32_pd(simde__m64 a)
Definition: sse2.h:1287
#define HEDLEY_LIKELY(expr)
Definition: hedley.h:1065
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_mullo_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2267
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpgt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1049
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storeu_si128(simde__m128i *mem_addr, simde__m128i a)
Definition: sse2.h:3540
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_avg_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:471
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_packs_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:2352
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu64x(uint64_t e1, uint64_t e0)
Definition: sse2.h:2612
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epu16(simde__m128i a, simde__m128i b)
Definition: sse2.h:3782
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_shufflehi_epi16(simde__m128i a, const int imm8)
Definition: sse2.h:2973
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpacklo_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:4117
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_avg_epu16(simde__m128i a, simde__m128i b)
Definition: sse2.h:488
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvtsi128_si32(simde__m128i a)
Definition: sse2.h:1410
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtps_epi32(simde__m128 a)
Definition: sse2.h:1302
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvttpd_pi32(simde__m128d a)
Definition: sse2.h:1542
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_movpi64_epi64(simde__m64 a)
Definition: sse2.h:1944
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvttps_epi32(simde__m128 a)
Definition: sse2.h:1558
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvtsd_si64(simde__m128d a)
Definition: sse2.h:1376
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_undefined_si128(void)
Definition: sse2.h:3910
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:769
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpneq_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:838
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_setzero_si128(void)
Definition: sse2.h:2907
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
Definition: sse2.h:2840
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu8(uint8_t e15, uint8_t e14, uint8_t e13, uint8_t e12, uint8_t e11, uint8_t e10, uint8_t e9, uint8_t e8, uint8_t e7, uint8_t e6, uint8_t e5, uint8_t e4, uint8_t e3, uint8_t e2, uint8_t e1, uint8_t e0)
Definition: sse2.h:2550
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtss_sd(simde__m128d a, simde__m128 b)
Definition: sse2.h:1511
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_movemask_pd(simde__m128d a)
Definition: sse2.h:1915
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:751
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_set_epu32(uint32_t e3, uint32_t e2, uint32_t e1, uint32_t e0)
Definition: sse2.h:2598
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_sfence(void)
Definition: sse.h:2048
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_mulhi_epu16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2249
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_mul_epu32(simde__m128i a, simde__m128i b)
Definition: sse2.h:2131
uint32_t u32[2]
Definition: mmx.h:72
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtsd_ss(simde__m128 a, simde__m128d b)
Definition: sse2.h:1391
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpackhi_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:3944
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpd_ps(simde__m128d a)
Definition: sse2.h:1272
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_div_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1618
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_sd(simde_float64 *mem_addr, simde__m128d a)
Definition: sse2.h:3461
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4, int16_t e3, int16_t e2, int16_t e1, int16_t e0)
Definition: sse2.h:2472
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_move_epi64(simde__m128i a)
Definition: sse2.h:2099
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epu16(simde__m128i a, simde__m128i b)
Definition: sse2.h:384
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi32(int32_t a)
Definition: sse2.h:2706
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpacklo_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:4051
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_bslli_si128(simde__m128i a, const int imm8)
Definition: sse2.h:505
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_sub_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3671
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_castsi128_ps(simde__m128i a)
Definition: sse2.h:734
int32_t i32[2]
Definition: mmx.h:68
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:3758
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi16(int16_t e7, int16_t e6, int16_t e5, int16_t e4, int16_t e3, int16_t e2, int16_t e1, int16_t e0)
Definition: sse2.h:2815
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storel_epi64(simde__m128i *mem_addr, simde__m128i a)
Definition: sse2.h:3494
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:3620
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpunord_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1196
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpord_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1166
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_packs_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2324
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:3711
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_packus_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2381
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtpd_pi32(simde__m128d a)
Definition: sse2.h:1257
#define SIMDE__BEGIN_DECLS
Definition: simde-common.h:130
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpgt_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1034
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpord_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1150
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_max_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2029
#define SIMDE__FUNCTION_ATTRIBUTES
Definition: simde-common.h:121
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:221
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_slli_epi32(simde__m128i a, const int imm8)
Definition: sse2.h:3315
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpge_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1077
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sra_epi16(simde__m128i a, simde__m128i count)
Definition: sse2.h:3233
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpge_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1062
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_pd(simde_float64 mem_addr[HEDLEY_ARRAY_PARAM(2)], simde__m128d a)
Definition: sse2.h:3554
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_bsrli_si128(simde__m128i a, const int imm8)
Definition: sse2.h:546
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_slli_epi64(simde__m128i a, const int imm8)
Definition: sse2.h:3339
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8, int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0)
Definition: sse2.h:2439
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_min_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:1978
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvttpd_epi32(simde__m128d a)
Definition: sse2.h:1526
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srai_epi32(simde__m128i a, int imm8)
Definition: sse2.h:3202
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpeq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:825
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srli_epi64(simde__m128i a, const int imm8)
Definition: sse2.h:3403
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnlt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1120
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_sqrt_pd(simde__m128d a)
Definition: sse2.h:3090
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cvtpd_epi32(simde__m128d a)
Definition: sse2.h:1242
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpackhi_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:3967
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srl_epi64(simde__m128i a, simde__m128i count)
Definition: sse2.h:3160
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srl_epi16(simde__m128i a, simde__m128i count)
Definition: sse2.h:3120
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpneq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:857
#define simde_assert_aligned(alignment, val)
Definition: simde-common.h:50
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmple_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:967
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_subs_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:3734
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_load_sd(simde_float64 const *mem_addr)
Definition: sse2.h:1696
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_xor_si128(simde__m128i a, simde__m128i b)
Definition: sse2.h:4164
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_load_pd1(simde_float64 const *mem_addr)
Definition: sse2.h:1680
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_andnot_si128(simde__m128i a, simde__m128i b)
Definition: sse2.h:454
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_mulhi_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:2220
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvtsd_si32(simde__m128d a)
Definition: sse2.h:1366
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_unpackhi_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:4033
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_and_si128(simde__m128i a, simde__m128i b)
Definition: sse2.h:420
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpgt_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:998
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_min_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:1959
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_extract_epi16(simde__m128i a, const int imm8)
Definition: sse2.h:1633
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comige_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:607
SIMDE__FUNCTION_ATTRIBUTES double simde_mm_cvtsd_f64(simde__m128d a)
Definition: sse2.h:1356
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_sd(simde_float64 a)
Definition: sse2.h:2653
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnlt_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1110
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storer_pd(simde_float64 mem_addr[2], simde__m128d a)
Definition: sse2.h:3516
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_maskmoveu_si128(simde__m128i a, simde__m128i mask, int8_t mem_addr[HEDLEY_ARRAY_PARAM(16)])
Definition: sse2.h:1860
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomilt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3866
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_pause(void)
Definition: sse2.h:2407
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_slli_epi16(simde__m128i a, const int imm8)
Definition: sse2.h:3291
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_lfence(void)
Definition: sse2.h:3924
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_castps_si128(simde__m128 a)
Definition: sse2.h:702
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:365
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi8(int8_t e15, int8_t e14, int8_t e13, int8_t e12, int8_t e11, int8_t e10, int8_t e9, int8_t e8, int8_t e7, int8_t e6, int8_t e5, int8_t e4, int8_t e3, int8_t e2, int8_t e1, int8_t e0)
Definition: sse2.h:2778
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpnle_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1140
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srl_epi32(simde__m128i a, simde__m128i count)
Definition: sse2.h:3140
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtps_pd(simde__m128 a)
Definition: sse2.h:1341
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpeq_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:805
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpacklo_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:4073
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_insert_epi16(simde__m128i a, int32_t i, const int imm8)
Definition: sse2.h:1646
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comieq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:597
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_setr_epi64(simde__m64 e1, simde__m64 e0)
Definition: sse2.h:2860
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_add_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:204
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmpunord_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1180
simde__m128d
Definition: sse2.h:175
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_add_si64(simde__m64 a, simde__m64 b)
Definition: sse2.h:302
uint64_t u64[1]
Definition: mmx.h:73
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_shuffle_pd(simde__m128d a, simde__m128d b, const int imm8)
Definition: sse2.h:2950
HEDLEY_STATIC_ASSERT(16==sizeof(simde__m128i), "simde__m128i size incorrect")
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomieq_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3806
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_load_pd(simde_float64 const mem_addr[HEDLEY_ARRAY_PARAM(2)])
Definition: sse2.h:1661
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cvtsi64_sd(simde__m128d a, int32_t b)
Definition: sse2.h:1471
#define SIMDE__VECTORIZE
Definition: simde-common.h:98
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_mul_su32(simde__m64 a, simde__m64 b)
Definition: sse2.h:2206
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi8(int8_t a)
Definition: sse2.h:2668
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_min_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:1997
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_max_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:2048
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmpeq_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:787
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomile_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3851
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_shuffle_epi32(simde__m128i a, const int imm8)
Definition: sse2.h:2924
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comile_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:627
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_and_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:403
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storeu_pd(simde_float64 *mem_addr, simde__m128d a)
Definition: sse2.h:3530
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_load_si128(simde__m128i const *mem_addr)
Definition: sse2.h:1711
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_adds_epi16(simde__m128i a, simde__m128i b)
Definition: sse2.h:340
SIMDE__FUNCTION_ATTRIBUTES int64_t simde_mm_cvttsd_si64(simde__m128d a)
Definition: sse2.h:1586
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadu_si128(simde__m128i const *mem_addr)
Definition: sse2.h:1818
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_setr_pd(simde_float64 e1, simde_float64 e0)
Definition: sse2.h:2877
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set1_epi64x(int64_t a)
Definition: sse2.h:2725
SIMDE_FLOAT64_TYPE simde_float64
Definition: simde-common.h:160
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sll_epi32(simde__m128i a, simde__m128i count)
Definition: sse2.h:3050
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtepi32_ps(simde__m128i a)
Definition: sse2.h:1225
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srli_epi16(simde__m128i a, const int imm8)
Definition: sse2.h:3355
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_move_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2116
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_castpd_ps(simde__m128d a)
Definition: sse2.h:657
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_set_pd(simde_float64 e1, simde_float64 e0)
Definition: sse2.h:2623
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi64x(int64_t e1, int64_t e0)
Definition: sse2.h:2532
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmplt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:939
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_mul_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2191
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comigt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:617
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_cmplt_epi8(simde__m128i a, simde__m128i b)
Definition: sse2.h:870
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_cmple_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:952
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_mfence(void)
Definition: sse2.h:3934
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi64(simde__m64 e1, simde__m64 e0)
Definition: sse2.h:2517
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_unpackhi_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:4015
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_srli_epi32(simde__m128i a, const int imm8)
Definition: sse2.h:3379
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_movepi64_pi64(simde__m128i a)
Definition: sse2.h:1930
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sub_epi32(simde__m128i a, simde__m128i b)
Definition: sse2.h:3637
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sad_epu8(simde__m128i a, simde__m128i b)
Definition: sse2.h:2415
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storeh_pd(simde_float64 *mem_addr, simde__m128d a)
Definition: sse2.h:3484
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_x_mm_mul_epi64(simde__m128i a, simde__m128i b)
Definition: sse2.h:2148
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_max_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2067
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_shufflelo_epi16(simde__m128i a, const int imm8)
Definition: sse2.h:3002
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_setzero_pd(void)
Definition: sse2.h:2892
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_loadl_epi64(simde__m128i const *mem_addr)
Definition: sse2.h:1747
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_set_epi32(int32_t e3, int32_t e2, int32_t e1, int32_t e0)
Definition: sse2.h:2497
SIMDE__FUNCTION_ATTRIBUTES simde__m128i simde_mm_sll_epi16(simde__m128i a, simde__m128i count)
Definition: sse2.h:3030
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomigt_sd(simde__m128d a, simde__m128d b)
Definition: sse2.h:3836
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_si64(int64_t *mem_addr, int64_t a)
Definition: sse2.h:3587
SIMDE__FUNCTION_ATTRIBUTES simde__m128d simde_mm_or_pd(simde__m128d a, simde__m128d b)
Definition: sse2.h:2288