28 #if !defined(SIMDE__SSE_H) 29 #if !defined(SIMDE__SSE_H) 34 #if defined(SIMDE_SSE_NATIVE) 35 #undef SIMDE_SSE_NATIVE 37 #if defined(SIMDE_SSE_FORCE_NATIVE) 38 #define SIMDE_SSE_NATIVE 39 #elif defined(__SSE__) && !defined(SIMDE_SSE_NO_NATIVE) && \ 40 !defined(SIMDE_NO_NATIVE) 41 #define SIMDE_SSE_NATIVE 42 #elif defined(__ARM_NEON) && !defined(SIMDE_SSE_NO_NEON) && \ 43 !defined(SIMDE_NO_NEON) 44 #define SIMDE_SSE_NEON 47 #if defined(SIMDE_SSE_NATIVE) && !defined(SIMDE_MMX_NATIVE) 48 #if defined(SIMDE_SSE_FORCE_NATIVE) 49 #error Native SSE support requires native MMX support 51 #warning Native SSE support requires native MMX support, disabling 52 #undef SIMDE_SSE_NATIVE 54 #elif defined(SIMDE_SSE_NEON) && !defined(SIMDE_MMX_NEON) 55 #warning SSE3 NEON support requires MMX NEON support, disabling 56 #undef SIMDE_SSE3_NEON 59 #if defined(SIMDE_SSE_NATIVE) 60 #include <xmmintrin.h> 62 #if defined(SIMDE_SSE_NEON) 66 #if !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \ 67 (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) 68 #include <stdatomic.h> 77 #define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment))) 81 #if defined(SIMDE__ENABLE_GCC_VEC_EXT) 82 int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
/* Alternative views of the 128-bit simde__m128 value.  When GCC vector
 * extensions are available these are true vector types; __may_alias__
 * allows the same 16 bytes to be reinterpreted through any member
 * without violating strict aliasing. */
83 int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
84 int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
85 int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
86 uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
87 uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
88 uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
89 uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
/* 128-bit integer lanes only exist when the compiler provides a
 * 128-bit integer type. */
90 #if defined(SIMDE__HAVE_INT128) 91 simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
92 simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
94 simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
/* Plain-array fallback views (no vector extensions). */
104 #if defined(SIMDE__HAVE_INT128) 105 simde_int128 i128[1];
106 simde_uint128 u128[1];
/* Native __m128 (SSE) or float32x4_t (NEON) view, used by the
 * accelerated implementations below. */
111 #if defined(SIMDE_SSE_NATIVE) 113 #elif defined(SIMDE_SSE_NEON) 122 float32x4_t neon_f32;
126 #if defined(SIMDE_SSE_NATIVE) 128 "__m128 size doesn't match simde__m128 size");
135 #elif defined(SIMDE_SSE_NEON) 136 #define SIMDE__M128_NEON_C(T, expr) \ 137 (simde__m128) { .neon_##T = expr } 146 #if defined(SIMDE_SSE_NATIVE) 147 r.n = _mm_add_ps(a.n, b.n);
148 #elif defined(SIMDE_SSE_NEON) 149 r.neon_f32 = vaddq_f32(a.neon_f32, b.neon_f32);
152 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
153 r.f32[i] = a.f32[i] + b.f32[i];
165 #if defined(SIMDE_SSE_NATIVE) 166 r.n = _mm_add_ss(a.n, b.n);
167 #elif defined(SIMDE_SSE_NEON) 168 float32_t b0 = vgetq_lane_f32(b.neon_f32, 0);
169 float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
171 r.neon_f32 = vaddq_f32(a.neon_f32, value);
172 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) 173 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
simde_mm_add_ps(a, b).f32,
176 r.f32[0] = a.f32[0] + b.f32[0];
190 #if defined(SIMDE_SSE_NATIVE) 191 r.n = _mm_and_ps(a.n, b.n);
192 #elif defined(SIMDE_SSE_NEON) 193 r.neon_i32 = vandq_s32(a.neon_i32, b.neon_i32);
196 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
197 r.i32[i] = a.i32[i] & b.i32[i];
209 #if defined(SIMDE_SSE_NATIVE) 210 r.n = _mm_andnot_ps(a.n, b.n);
211 #elif defined(SIMDE_SSE_NEON) 212 r.neon_i32 = vbicq_s32(b.neon_i32, a.neon_i32);
215 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
216 r.i32[i] = ~(a.i32[i]) & b.i32[i];
228 #if defined(SIMDE_SSE_NATIVE) 229 r.n = _mm_avg_pu16(a.n, b.n);
230 #elif defined(SIMDE_SSE_NEON) 231 r.neon_u16 = vrhadd_u16(b.neon_u16, a.neon_u16);
234 for (
size_t i = 0; i < 4; i++) {
235 r.
u16[i] = (a.
u16[i] + b.
u16[i] + 1) >> 1;
241 #define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b) 248 #if defined(SIMDE_SSE_NATIVE) 249 r.n = _mm_avg_pu8(a.n, b.n);
250 #elif defined(SIMDE_SSE_NEON) 251 r.neon_u8 = vrhadd_u8(b.neon_u8, a.neon_u8);
254 for (
size_t i = 0; i < 8; i++) {
255 r.
u8[i] = (a.
u8[i] + b.
u8[i] + 1) >> 1;
261 #define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b) 268 #if defined(SIMDE_SSE_NATIVE) 269 r.n = _mm_cmpeq_ps(a.n, b.n);
270 #elif defined(SIMDE_SSE_NEON) 271 r.neon_u32 = vceqq_f32(a.neon_f32, b.neon_f32);
274 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
275 r.u32[i] = (a.f32[i] == b.f32[i]) ? 0xffffffff : 0;
287 #if defined(SIMDE_SSE_NATIVE) 288 r.n = _mm_cmpeq_ss(a.n, b.n);
289 #elif defined(SIMDE_SSE_NEON) 291 vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32));
292 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
293 r.neon_f32 = vextq_f32(t, t, 3);
294 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) 295 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
298 r.u32[0] = (a.f32[0] == b.f32[0]) ? 0xffffffff : 0;
300 for (
size_t i = 1; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
313 #if defined(SIMDE_SSE_NATIVE) 314 r.n = _mm_cmpge_ps(a.n, b.n);
315 #elif defined(SIMDE_SSE_NEON) 316 r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32);
319 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
320 r.u32[i] = (a.f32[i] >= b.f32[i]) ? 0xffffffff : 0;
332 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) 333 r.n = _mm_cmpge_ss(a.n, b.n);
334 #elif defined(SIMDE_SSE_NEON) 336 vreinterpretq_f32_u32(vcgeq_f32(a.neon_f32, b.neon_f32));
337 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
338 r.neon_f32 = vextq_f32(t, t, 3);
339 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) 340 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
343 r.u32[0] = (a.f32[0] >= b.f32[0]) ? 0xffffffff : 0;
345 for (
size_t i = 1; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
358 #if defined(SIMDE_SSE_NATIVE) 359 r.n = _mm_cmpgt_ps(a.n, b.n);
360 #elif defined(SIMDE_SSE_NEON) 361 r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32);
364 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
365 r.u32[i] = (a.f32[i] > b.f32[i]) ? 0xffffffff : 0;
377 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) 378 r.n = _mm_cmpgt_ss(a.n, b.n);
379 #elif defined(SIMDE_SSE_NEON) 381 vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32));
382 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
383 r.neon_f32 = vextq_f32(t, t, 3);
384 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) 385 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
388 r.u32[0] = (a.f32[0] > b.f32[0]) ? 0xffffffff : 0;
390 for (
size_t i = 1; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
403 #if defined(SIMDE_SSE_NATIVE) 404 r.n = _mm_cmple_ps(a.n, b.n);
405 #elif defined(SIMDE_SSE_NEON) 406 r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32);
409 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
410 r.u32[i] = (a.f32[i] <= b.f32[i]) ? 0xffffffff : 0;
422 #if defined(SIMDE_SSE_NATIVE) 423 r.n = _mm_cmple_ss(a.n, b.n);
424 #elif defined(SIMDE_SSE_NEON) 426 vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32));
427 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
428 r.neon_f32 = vextq_f32(t, t, 3);
429 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) 430 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
433 r.u32[0] = (a.f32[0] <= b.f32[0]) ? 0xffffffff : 0;
435 for (
size_t i = 1; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
448 #if defined(SIMDE_SSE_NATIVE) 449 r.n = _mm_cmplt_ps(a.n, b.n);
450 #elif defined(SIMDE_SSE_NEON) 451 r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32);
454 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
455 r.u32[i] = (a.f32[i] < b.f32[i]) ? 0xffffffff : 0;
467 #if defined(SIMDE_SSE_NATIVE) 468 r.n = _mm_cmplt_ss(a.n, b.n);
469 #elif defined(SIMDE_SSE_NEON) 471 vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32));
472 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
473 r.neon_f32 = vextq_f32(t, t, 3);
474 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) 475 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
478 r.u32[0] = (a.f32[0] < b.f32[0]) ? 0xffffffff : 0;
480 for (
size_t i = 1; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
493 #if defined(SIMDE_SSE_NATIVE) 494 r.n = _mm_cmpneq_ps(a.n, b.n);
495 #elif defined(SIMDE_SSE_NEON) 496 r.neon_u32 = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32));
499 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
500 r.u32[i] = (a.f32[i] != b.f32[i]) ? 0xffffffff : 0;
512 #if defined(SIMDE_SSE_NATIVE) 513 r.n = _mm_cmpneq_ss(a.n, b.n);
514 #elif defined(SIMDE_SSE_NEON) 516 vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32));
518 vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(e)));
519 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
520 r.neon_f32 = vextq_f32(t, t, 3);
521 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) 522 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
525 r.u32[0] = (a.f32[0] != b.f32[0]) ? 0xffffffff : 0;
527 for (
size_t i = 1; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
540 #if defined(SIMDE_SSE_NATIVE) 541 r.n = _mm_cmpnge_ps(a.n, b.n);
542 #elif defined(SIMDE_SSE_NEON) 543 r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32);
556 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) 557 r.n = _mm_cmpnge_ss(a.n, b.n);
558 #elif defined(SIMDE_SSE_NEON) 560 vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32));
561 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
562 r.neon_f32 = vextq_f32(t, t, 3);
575 #if defined(SIMDE_SSE_NATIVE) 576 r.n = _mm_cmpngt_ps(a.n, b.n);
577 #elif defined(SIMDE_SSE_NEON) 578 r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32);
591 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) 592 r.n = _mm_cmpngt_ss(a.n, b.n);
593 #elif defined(SIMDE_SSE_NEON) 595 vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32));
596 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
597 r.neon_f32 = vextq_f32(t, t, 3);
610 #if defined(SIMDE_SSE_NATIVE) 611 r.n = _mm_cmpnle_ps(a.n, b.n);
612 #elif defined(SIMDE_SSE_NEON) 613 r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32);
626 #if defined(SIMDE_SSE_NATIVE) 627 r.n = _mm_cmpnle_ss(a.n, b.n);
628 #elif defined(SIMDE_SSE_NEON) 630 vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32));
631 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
632 r.neon_f32 = vextq_f32(t, t, 3);
645 #if defined(SIMDE_SSE_NATIVE) 646 r.n = _mm_cmpnlt_ps(a.n, b.n);
647 #elif defined(SIMDE_SSE_NEON) 648 r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32);
661 #if defined(SIMDE_SSE_NATIVE) 662 r.n = _mm_cmpnlt_ss(a.n, b.n);
675 #if defined(SIMDE_SSE_NATIVE) 676 r.n = _mm_cmpord_ps(a.n, b.n);
677 #elif defined(SIMDE_SSE_NEON) 681 uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32);
682 uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32);
683 r.neon_u32 = vandq_u32(ceqaa, ceqbb);
686 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
687 r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0
700 #if defined(SIMDE_SSE_NATIVE) 701 r.n = _mm_cmpord_ss(a.n, b.n);
702 #elif defined(SIMDE_SSE_NEON) 703 uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32);
704 uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32);
705 float32x4_t s = vreinterpretq_f32_u32(vandq_u32(ceqaa, ceqbb));
706 float32x4_t t = vextq_f32(a.neon_f32, s, 1);
707 r.neon_f32 = vextq_f32(t, t, 3);
708 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) 709 r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
712 r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0 : 0xffffffff;
714 for (
size_t i = 1; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
727 #if defined(SIMDE_SSE_NATIVE) 728 r.n = _mm_cmpunord_ps(a.n, b.n);
731 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
732 r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0xffffffff
745 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) 746 r.n = _mm_cmpunord_ss(a.n, b.n);
747 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION) 748 r.f32 = SIMDE__SHUFFLE_VECTOR(
751 r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0xffffffff : 0;
753 for (
size_t i = 1; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
764 #if defined(SIMDE_SSE_NATIVE) 765 return _mm_comieq_ss(a.n, b.n);
766 #elif defined(SIMDE_SSE_NEON) 767 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
768 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
769 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
770 uint32x4_t a_eq_b = vceqq_f32(a.neon_f32, b.neon_f32);
771 return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0;
773 return a.f32[0] == b.f32[0];
780 #if defined(SIMDE_SSE_NATIVE) 781 return _mm_comige_ss(a.n, b.n);
782 #elif defined(SIMDE_SSE_NEON) 783 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
784 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
785 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
786 uint32x4_t a_ge_b = vcgeq_f32(a.neon_f32, b.neon_f32);
787 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1
790 return a.f32[0] >= b.f32[0];
797 #if defined(SIMDE_SSE_NATIVE) 798 return _mm_comigt_ss(a.n, b.n);
799 #elif defined(SIMDE_SSE_NEON) 800 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
801 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
802 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
803 uint32x4_t a_gt_b = vcgtq_f32(a.neon_f32, b.neon_f32);
804 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1
807 return a.f32[0] > b.f32[0];
814 #if defined(SIMDE_SSE_NATIVE) 815 return _mm_comile_ss(a.n, b.n);
816 #elif defined(SIMDE_SSE_NEON) 817 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
818 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
819 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
820 uint32x4_t a_le_b = vcleq_f32(a.neon_f32, b.neon_f32);
821 return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 1 : 0;
823 return a.f32[0] <= b.f32[0];
830 #if defined(SIMDE_SSE_NATIVE) 831 return _mm_comilt_ss(a.n, b.n);
/* Fixed guard: was a duplicate SIMDE_SSE_NATIVE test, which made this
 * NEON implementation unreachable (the first branch already handles
 * native SSE).  All sibling comi* functions use SIMDE_SSE_NEON here. */
832 #elif defined(SIMDE_SSE_NEON) 833 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
834 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
835 uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
836 uint32x4_t a_lt_b = vcltq_f32(a.neon_f32, b.neon_f32);
837 return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0;
839 return a.f32[0] < b.f32[0];
846 #if defined(SIMDE_SSE_NATIVE) 847 return _mm_comineq_ss(a.n, b.n);
848 #elif defined(SIMDE_SSE_NEON) 849 uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
850 uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
851 uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
852 uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32));
853 return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0)
857 return a.f32[0] != b.f32[0];
866 #if defined(SIMDE_SSE_NATIVE) 867 r.n = _mm_cvt_pi2ps(a.n, b.n);
883 #if defined(SIMDE_SSE_NATIVE) 884 r.n = _mm_cvt_ps2pi(a.n);
887 for (
size_t i = 0; i < (
sizeof(r.
i32) /
sizeof(r.
i32[0])); i++) {
888 r.
i32[i] = (int32_t)a.f32[i];
900 #if defined(SIMDE_SSE_NATIVE) 901 r.n = _mm_cvt_si2ss(a.n, b);
915 #if defined(SIMDE_SSE_NATIVE) 916 return _mm_cvt_ss2si(a.n);
918 return (int32_t)a.f32[0];
927 #if defined(SIMDE_SSE_NATIVE) 928 r.n = _mm_cvtpi16_ps(a.n);
931 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
944 #if defined(SIMDE_SSE_NATIVE) 945 r.n = _mm_cvtpi32_ps(a.n, b.n);
961 #if defined(SIMDE_SSE_NATIVE) 962 r.n = _mm_cvtpi32x2_ps(a.n, b.n);
978 #if defined(SIMDE_SSE_NATIVE) 979 r.n = _mm_cvtpi8_ps(a.n);
995 #if defined(SIMDE_SSE_NATIVE) 996 r.n = _mm_cvtps_pi16(a.n);
999 for (
size_t i = 0; i < (
sizeof(r.
i16) /
sizeof(r.
i16[0])); i++) {
1000 r.
i16[i] = (int16_t)a.f32[i];
1012 #if defined(SIMDE_SSE_NATIVE) 1013 r.n = _mm_cvtps_pi32(a.n);
1016 for (
size_t i = 0; i < (
sizeof(r.
i32) /
sizeof(r.
i32[0])); i++) {
1017 r.
i32[i] = (int32_t)a.f32[i];
1029 #if defined(SIMDE_SSE_NATIVE) 1030 r.n = _mm_cvtps_pi8(a.n);
1033 for (
size_t i = 0; i < (
sizeof(a.f32) /
sizeof(a.f32[0])); i++) {
1034 r.
i8[i] = (int8_t)a.f32[i];
1046 #if defined(SIMDE_SSE_NATIVE) 1047 r.n = _mm_cvtpu16_ps(a.n);
1050 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
1063 #if defined(SIMDE_SSE_NATIVE) 1064 r.n = _mm_cvtpu8_ps(a.n);
1067 for (
size_t i = 0; i < 4; i++) {
1080 #if defined(SIMDE_SSE_NATIVE) 1081 r.n = _mm_cvtsi32_ss(a.n, b);
1085 for (
size_t i = 1; i < 4; i++) {
1086 r.i32[i] = a.i32[i];
1098 #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) 1100 r.n = _mm_cvtsi64_ss(a.n, b);
1102 r.n = _mm_cvtsi64x_ss(a.n, b);
1107 for (
size_t i = 1; i < 4; i++) {
1108 r.i32[i] = a.i32[i];
1118 #if defined(SIMDE_SSE_NATIVE) 1119 return _mm_cvtss_f32(a.n);
1120 #elif defined(SIMDE_SSE_NEON) 1121 return vgetq_lane_f32(a.neon_f32, 0);
1130 #if defined(SIMDE_SSE_NATIVE) 1131 return _mm_cvtss_si32(a.n);
1133 return (int32_t)a.f32[0];
1140 #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) 1142 return _mm_cvtss_si64(a.n);
1144 return _mm_cvtss_si64x(a.n);
1147 return (int64_t)a.f32[0];
1156 #if defined(SIMDE_SSE_NATIVE) 1157 r.n = _mm_cvtt_ps2pi(a.n);
1160 for (
size_t i = 0; i < (
sizeof(r.
f32) /
sizeof(r.
f32[0])); i++) {
1161 r.
i32[i] = (int32_t)truncf(a.f32[i]);
1171 #if defined(SIMDE_SSE_NATIVE) 1172 return _mm_cvtt_ss2si(a.n);
1174 return (int32_t)truncf(a.f32[0]);
1183 #if defined(SIMDE_SSE_NATIVE) 1184 r.n = _mm_cvttps_pi32(a.n);
1195 #if defined(SIMDE_SSE_NATIVE) 1196 return _mm_cvttss_si32(a.n);
1198 return (int32_t)truncf(a.f32[0]);
1205 #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64) 1207 return _mm_cvttss_si64x(a.n);
1209 return _mm_cvttss_si64(a.n);
1212 return (int64_t)truncf(a.f32[0]);
1221 #if defined(SIMDE_SSE_NATIVE) 1222 r.n = _mm_div_ps(a.n, b.n);
/* NEON (AArch32) has no vector divide: estimate 1/b with vrecpeq_f32,
 * refine with one Newton-Raphson step (vrecpsq_f32), then multiply.
 * NOTE(review): this makes the NEON result approximate, not
 * bit-identical to true division — confirm acceptable precision. */
1223 #elif defined(SIMDE_SSE_NEON) 1224 float32x4_t recip0 = vrecpeq_f32(b.neon_f32);
1225 float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b.neon_f32));
1226 r.neon_f32 = vmulq_f32(a.neon_f32, recip1);
/* Portable fallback: exact lane-wise division. */
1229 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
1230 r.f32[i] = a.f32[i] / b.f32[i];
1242 #if defined(SIMDE_SSE_NATIVE) 1243 r.n = _mm_div_ss(a.n, b.n);
1244 #elif defined(SIMDE_SSE_NEON) 1246 r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
1248 r.f32[0] = a.f32[0] / b.f32[0];
1250 for (
size_t i = 1; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
1251 r.f32[i] = a.f32[i];
1263 #if defined(SIMDE_SSE_NATIVE) 1264 #define simde_mm_extract_pi16(a, imm8) _mm_extract_pi16(a.n, imm8) 1266 #define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a.n, imm8) 1269 #if defined(SIMDE_SSE_NATIVE) 1276 #if defined(FE_TONEAREST) 1282 #if defined(FE_DOWNWARD) 1288 #if defined(FE_UPWARD) 1294 #if defined(FE_TOWARDZERO) 1303 #if defined(SIMDE_SSE_NATIVE) 1304 return _MM_GET_ROUNDING_MODE();
1306 return fegetround();
1313 #if defined(SIMDE_SSE_NATIVE) 1314 _MM_SET_ROUNDING_MODE(a);
1328 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) 1329 #define simde_mm_insert_pi16(a, i, imm8) \ 1330 SIMDE__M64_C(_mm_insert_pi16((a).n, i, imm8)); 1332 #define simde_m_pinsrw(a, i, imm8) \ 1333 SIMDE__M64_C(simde_mm_insert_pi16((a).n, i, imm8)); 1343 #if defined(SIMDE_SSE_NATIVE) 1344 r.n = _mm_load_ps(mem_addr);
1345 #elif defined(SIMDE_SSE_NEON) 1346 r.neon_f32 = vld1q_f32(mem_addr);
1348 memcpy(&r, mem_addr,
sizeof(r.f32));
1359 #if defined(SIMDE_SSE_NATIVE) 1360 r.n = _mm_load_ps1(mem_addr);
1364 for (
size_t i = 0; i < (
sizeof(r.i32) /
sizeof(r.i32[0])); i++) {
1377 #if defined(SIMDE_SSE_NATIVE) 1378 r.n = _mm_load_ss(mem_addr);
1379 #elif defined(SIMDE_SSE_NEON) 1380 r.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
1382 r.f32[0] = *mem_addr;
1396 #if defined(SIMDE_SSE_NATIVE) 1397 r.n = _mm_load1_ps(mem_addr);
1398 #elif defined(SIMDE_SSE_NEON) 1399 r.neon_f32 = vld1q_dup_f32(mem_addr);
1412 #if defined(SIMDE_SSE_NATIVE) 1413 r.n = _mm_loadh_pi(a.n, (__m64 *)mem_addr);
1415 r.f32[0] = a.f32[0];
1416 r.f32[1] = a.f32[1];
1417 r.f32[2] = mem_addr->f32[0];
1418 r.f32[3] = mem_addr->f32[1];
1429 #if defined(SIMDE_SSE_NATIVE) 1430 r.n = _mm_loadl_pi(a.n, (__m64 *)mem_addr);
1432 r.f32[0] = mem_addr->f32[0];
1433 r.f32[1] = mem_addr->f32[1];
1434 r.f32[2] = a.f32[2];
1435 r.f32[3] = a.f32[3];
1449 #if defined(SIMDE_SSE_NATIVE) 1450 r.n = _mm_loadr_ps(mem_addr);
1452 r.f32[0] = mem_addr[3];
1453 r.f32[1] = mem_addr[2];
1454 r.f32[2] = mem_addr[1];
1455 r.f32[3] = mem_addr[0];
1467 #if defined(SIMDE_SSE_NATIVE) 1468 r.n = _mm_loadu_ps(mem_addr);
1469 #elif defined(SIMDE_SSE_NEON) 1470 r.neon_f32 = vld1q_f32(mem_addr);
1472 r.f32[0] = mem_addr[0];
1473 r.f32[1] = mem_addr[1];
1474 r.f32[2] = mem_addr[2];
1475 r.f32[3] = mem_addr[3];
1484 #if defined(SIMDE_SSE_NATIVE) 1485 _mm_maskmove_si64(a.n, mask.n, mem_addr);
1488 for (
size_t i = 0; i < (
sizeof(a.
i8) /
sizeof(a.
i8[0])); i++)
1490 mem_addr[i] = a.
i8[i];
1493 #define simde_m_maskmovq(a, mask, mem_addr) \ 1494 simde_mm_maskmove_si64(a, mask, mem_addr) 1501 #if defined(SIMDE_SSE_NATIVE) 1502 r.n = _mm_max_pi16(a.n, b.n);
1505 for (
size_t i = 0; i < (
sizeof(r.
i16) /
sizeof(r.
i16[0])); i++) {
1512 #define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b) 1519 #if defined(SIMDE_SSE_NATIVE) 1520 r.n = _mm_max_ps(a.n, b.n);
1521 #elif defined(SIMDE_SSE_NEON) 1522 r.neon_f32 = vmaxq_f32(a.neon_f32, b.neon_f32);
1525 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
1526 r.f32[i] = (a.f32[i] > b.f32[i]) ? a.f32[i] : b.f32[i];
1538 #if defined(SIMDE_SSE_NATIVE) 1539 r.n = _mm_max_pu8(a.n, b.n);
1542 for (
size_t i = 0; i < (
sizeof(r.
u8) /
sizeof(r.
u8[0])); i++) {
1543 r.
u8[i] = (a.
u8[i] > b.
u8[i]) ? a.
u8[i] : b.
u8[i];
1549 #define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b) 1556 #if defined(SIMDE_SSE_NATIVE) 1557 r.n = _mm_max_ss(a.n, b.n);
1558 #elif defined(SIMDE_SSE_NEON) 1559 float32_t value = vgetq_lane_f32(vmaxq_f32(a.neon_f32, b.neon_f32), 0);
1560 r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
1562 r.f32[0] = (a.f32[0] > b.f32[0]) ? a.f32[0] : b.f32[0];
1563 r.f32[1] = a.f32[1];
1564 r.f32[2] = a.f32[2];
1565 r.f32[3] = a.f32[3];
1576 #if defined(SIMDE_SSE_NATIVE) 1577 r.n = _mm_min_pi16(a.n, b.n);
1580 for (
size_t i = 0; i < (
sizeof(r.
i16) /
sizeof(r.
i16[0])); i++) {
1587 #define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b) 1594 #if defined(SIMDE_SSE_NATIVE) 1595 r.n = _mm_min_ps(a.n, b.n);
1596 #elif defined(SIMDE_SSE_NEON) 1597 r.neon_f32 = vminq_f32(a.neon_f32, b.neon_f32);
1600 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
1601 r.f32[i] = (a.f32[i] < b.f32[i]) ? a.f32[i] : b.f32[i];
1613 #if defined(SIMDE_SSE_NATIVE) 1614 r.n = _mm_min_pu8(a.n, b.n);
1617 for (
size_t i = 0; i < (
sizeof(r.
u8) /
sizeof(r.
u8[0])); i++) {
1618 r.
u8[i] = (a.
u8[i] < b.
u8[i]) ? a.
u8[i] : b.
u8[i];
1624 #define simde_m_pminub(a, b) simde_mm_min_pu8(a, b) 1631 #if defined(SIMDE_SSE_NATIVE) 1632 r.n = _mm_min_ss(a.n, b.n);
1633 #elif defined(SIMDE_SSE_NEON) 1634 float32_t value = vgetq_lane_f32(vminq_f32(a.neon_f32, b.neon_f32), 0);
1635 r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
1637 r.f32[0] = (a.f32[0] < b.f32[0]) ? a.f32[0] : b.f32[0];
1638 r.f32[1] = a.f32[1];
1639 r.f32[2] = a.f32[2];
1640 r.f32[3] = a.f32[3];
1651 #if defined(SIMDE_SSE_NATIVE) 1652 r.n = _mm_move_ss(a.n, b.n);
1654 r.f32[0] = b.f32[0];
1655 r.f32[1] = a.f32[1];
1656 r.f32[2] = a.f32[2];
1657 r.f32[3] = a.f32[3];
1668 #if defined(SIMDE_SSE_NATIVE) 1669 r.n = _mm_movehl_ps(a.n, b.n);
1671 r.f32[0] = b.f32[2];
1672 r.f32[1] = b.f32[3];
1673 r.f32[2] = a.f32[2];
1674 r.f32[3] = a.f32[3];
1685 #if defined(SIMDE_SSE_NATIVE) 1686 r.n = _mm_movelh_ps(a.n, b.n);
1688 r.f32[0] = a.f32[0];
1689 r.f32[1] = a.f32[1];
1690 r.f32[2] = b.f32[0];
1691 r.f32[3] = b.f32[1];
1700 #if defined(SIMDE_SSE_NATIVE) 1701 return _mm_movemask_pi8(a.n);
1704 const size_t nmemb =
sizeof(a.
i8) /
sizeof(a.
i8[0]);
1707 for (
size_t i = 0; i < nmemb; i++) {
1708 r |= (a.
u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
1714 #define simde_m_pmovmskb(a, b) simde_mm_movemask_pi8(a, b) 1719 #if defined(SIMDE_SSE_NATIVE) 1720 return _mm_movemask_ps(a.n);
1721 #elif defined(SIMDE_SSE_NEON) 1723 static const uint32x4_t movemask = {1, 2, 4, 8};
1724 static const uint32x4_t highbit = {0x80000000, 0x80000000, 0x80000000,
1726 uint32x4_t t0 = a.neon_u32;
1727 uint32x4_t t1 = vtstq_u32(t0, highbit);
1728 uint32x4_t t2 = vandq_u32(t1, movemask);
1729 uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
1730 return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
1735 for (
size_t i = 0; i <
sizeof(a.u32) /
sizeof(a.u32[0]); i++) {
1736 r |= (a.u32[i] >> ((
sizeof(a.u32[i]) * CHAR_BIT) - 1)) << i;
1748 #if defined(SIMDE_SSE_NATIVE) 1749 r.n = _mm_mul_ps(a.n, b.n);
1750 #elif defined(SIMDE_SSE_NEON) 1751 r.neon_f32 = vmulq_f32(a.neon_f32, b.neon_f32);
1754 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
1755 r.f32[i] = a.f32[i] * b.f32[i];
1767 #if defined(SIMDE_SSE_NATIVE) 1768 r.n = _mm_mul_ss(a.n, b.n);
1770 r.f32[0] = a.f32[0] * b.f32[0];
1771 r.f32[1] = a.f32[1];
1772 r.f32[2] = a.f32[2];
1773 r.f32[3] = a.f32[3];
1784 #if defined(SIMDE_SSE_NATIVE) 1785 r.n = _mm_mulhi_pu16(a.n, b.n);
1788 for (
size_t i = 0; i < (
sizeof(r.
u16) /
sizeof(r.
u16[0])); i++) {
1795 #define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b) 1802 #if defined(SIMDE_SSE_NATIVE) 1803 r.n = _mm_or_ps(a.n, b.n);
1804 #elif defined(SIMDE_SSE_NEON) 1805 r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
1808 for (
size_t i = 0; i < (
sizeof(r.u32) /
sizeof(r.u32[0])); i++) {
1809 r.u32[i] = a.u32[i] | b.u32[i];
1822 #if defined(SIMDE_SSE_NATIVE) 1823 #define simde_mm_prefetch(p, i) _mm_prefetch(p, i) 1831 #if defined(SIMDE_SSE_NATIVE) 1832 r.n = _mm_rcp_ps(a.n);
1833 #elif defined(SIMDE_SSE_NEON) 1834 float32x4_t recip = vrecpeq_f32(a.neon_f32);
1836 #if !defined(SIMDE_MM_RCP_PS_ITERS) 1837 #define SIMDE_MM_RCP_PS_ITERS SIMDE_ACCURACY_ITERS 1840 for (
int i = 0; i < SIMDE_MM_RCP_PS_ITERS; ++i) {
1841 recip = vmulq_f32(recip, vrecpsq_f32(recip, a.neon_f32));
1847 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
1848 r.f32[i] = 1.0f / a.f32[i];
1860 #if defined(SIMDE_SSE_NATIVE) 1861 r.n = _mm_rcp_ss(a.n);
1863 r.f32[0] = 1.0f / a.f32[0];
1864 r.f32[1] = a.f32[1];
1865 r.f32[2] = a.f32[2];
1866 r.f32[3] = a.f32[3];
1877 #if defined(SIMDE_SSE_NATIVE) 1878 r.n = _mm_rsqrt_ps(a.n);
1879 #elif defined(SIMDE_SSE_NEON) 1880 r.neon_f32 = vrsqrteq_f32(a.neon_f32);
1881 #elif defined(__STDC_IEC_559__) 1884 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
1885 r.i32[i] = INT32_C(0x5f3759df) - (a.i32[i] >> 1);
1887 #if SIMDE_ACCURACY_ITERS > 2 1891 (
half * r.f32[i] * r.f32[i]);
1896 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
1897 r.f32[i] = 1.0f / sqrtf(a.f32[i]);
1909 #if defined(SIMDE_SSE_NATIVE) 1910 r.n = _mm_rsqrt_ss(a.n);
1911 #elif defined(__STDC_IEC_559__) 1913 r.i32[0] = INT32_C(0x5f3759df) - (a.i32[0] >> 1);
1915 #if SIMDE_ACCURACY_ITERS > 2 1919 (
half * r.f32[0] * r.f32[0]);
1922 r.f32[0] = 1.0f / sqrtf(a.f32[0]);
1923 r.f32[1] = a.f32[1];
1924 r.f32[2] = a.f32[2];
1925 r.f32[3] = a.f32[3];
1927 r.f32[0] = 1.0f / sqrtf(a.f32[0]);
1928 r.f32[1] = a.f32[1];
1929 r.f32[2] = a.f32[2];
1930 r.f32[3] = a.f32[3];
1941 #if defined(SIMDE_SSE_NATIVE) 1942 r.n = _mm_sad_pu8(a.n, b.n);
1947 for (
size_t i = 0; i < (
sizeof(r.
u8) /
sizeof(r.
u8[0])); i++) {
1948 sum += (uint8_t)abs(a.
u8[i] - b.
u8[i]);
1959 #define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b) 1967 #if defined(SIMDE_SSE_NATIVE) 1968 r.n = _mm_set_ps(e3, e2, e1, e0);
1969 #elif defined(SIMDE_SSE_NEON) 1971 r.neon_f32 = vld1q_f32(data);
1987 #if defined(SIMDE_SSE_NATIVE) 1988 r.n = _mm_set1_ps(a);
1989 #elif defined(SIMDE_SSE_NEON) 1990 r.neon_f32 = vdupq_n_f32(a);
1997 #define simde_mm_set1_ps(a) simde_mm_set_ps1(a) 2004 #if defined(SIMDE_SSE_NATIVE) 2005 r.n = _mm_set_ss(a);
2019 #if defined(SIMDE_SSE_NATIVE) 2020 r.n = _mm_setr_ps(e3, e2, e1, e0);
2021 #elif defined(SIMDE_SSE_NEON) 2023 r.neon_f32 = vld1q_f32(data);
2036 #if defined(SIMDE_SSE_NATIVE) 2037 r.n = _mm_setzero_ps();
2038 #elif defined(SIMDE_SSE_NEON) 2039 r.neon_f32 = vdupq_n_f32(0.0f);
2051 #if defined(SIMDE_SSE_NATIVE) 2053 #elif defined(__GNUC__) && \ 2054 ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) 2055 __atomic_thread_fence(__ATOMIC_SEQ_CST);
2056 #elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \ 2057 (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__) 2058 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9) 2059 __atomic_thread_fence(__ATOMIC_SEQ_CST);
2061 atomic_thread_fence(memory_order_seq_cst);
2063 #elif defined(_MSC_VER) 2065 #elif defined(__GNUC__) && \ 2066 ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7)) 2067 __atomic_thread_fence(__ATOMIC_SEQ_CST);
2068 #elif HEDLEY_CLANG_HAS_FEATURE(c_atomic) 2069 __c11_atomic_thread_fence(__ATOMIC_SEQ_CST)
2070 #elif defined(__GNUC__) && \ 2071 ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1)) 2072 __sync_synchronize();
2073 #elif (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x5140)) || \ 2074 (defined(__SUNPRO_CC) && (__SUNPRO_CC >= 0x5140)) 2075 __atomic_thread_fence(__ATOMIC_SEQ_CST);
2076 #elif defined(_OPENMP) 2077 #pragma omp critical(simde_mm_sfence_) 2083 #define SIMDE_MM_SHUFFLE(z, y, x, w) \ 2084 (((z) << 6) | ((y) << 4) | ((x) << 2) | (w)) 2090 for (
size_t i = 0; i <
sizeof(r.
u16) /
sizeof(r.
u16[0]); i++) {
2091 r.
i16[i] = a.
i16[(imm8 >> (i * 2)) & 3];
2095 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) 2096 #define simde_mm_shuffle_pi16(a, imm8) SIMDE__M64_C(_mm_shuffle_pi16(a.n, imm8)) 2097 #elif defined(SIMDE__SHUFFLE_VECTOR) 2098 #define simde_mm_shuffle_pi16(a, imm8) \ 2100 const simde__m64 simde__tmp_a_ = a; \ 2101 (simde__m64){.i16 = SIMDE__SHUFFLE_VECTOR( \ 2102 16, 8, (simde__tmp_a_).i16, \ 2103 (simde__tmp_a_).i16, (((imm8)) & 3), \ 2104 (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \ 2105 (((imm8) >> 6) & 3))}; \ 2109 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) 2110 #define simde_m_pshufw(a, imm8) SIMDE__M64_C(_m_pshufw(a.n, imm8)) 2112 #define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8) 2119 r.f32[0] = a.f32[(imm8 >> 0) & 3];
2120 r.f32[1] = a.f32[(imm8 >> 2) & 3];
2121 r.f32[2] = b.f32[(imm8 >> 4) & 3];
2122 r.f32[3] = b.f32[(imm8 >> 6) & 3];
2125 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI) 2126 #define simde_mm_shuffle_ps(a, b, imm8) \ 2127 SIMDE__M128_C(_mm_shuffle_ps(a.n, b.n, imm8)) 2128 #elif defined(SIMDE__SHUFFLE_VECTOR) 2129 #define simde_mm_shuffle_ps(a, b, imm8) \ 2131 (simde__m128){.f32 = SIMDE__SHUFFLE_VECTOR( \ 2132 32, 16, (a).f32, (b).f32, \ 2133 (((imm8)) & 3), (((imm8) >> 2) & 3), \ 2134 (((imm8) >> 4) & 3) + 4, \ 2135 (((imm8) >> 6) & 3) + 4)}; \ 2144 #if defined(SIMDE_SSE_NATIVE) 2145 r.n = _mm_sqrt_ps(a.n);
2146 #elif defined(SIMDE_SSE_NEON) 2147 float32x4_t recipsq = vrsqrteq_f32(a.neon_f32);
2148 float32x4_t sq = vrecpeq_f32(recipsq);
2153 for (
size_t i = 0; i <
sizeof(r.f32) /
sizeof(r.f32[0]); i++) {
2154 r.f32[i] = sqrtf(a.f32[i]);
2166 #if defined(SIMDE_SSE_NATIVE) 2167 r.n = _mm_sqrt_ss(a.n);
2168 #elif defined(SIMDE_SSE_NEON) 2170 r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
2172 r.f32[0] = sqrtf(a.f32[0]);
2173 r.f32[1] = a.f32[1];
2174 r.f32[2] = a.f32[2];
2175 r.f32[3] = a.f32[3];
2186 #if defined(SIMDE_SSE_NATIVE) 2187 _mm_store_ps(mem_addr, a.n);
2188 #elif defined(SIMDE_SSE_NEON) 2189 vst1q_f32(mem_addr, a.neon_f32);
2192 for (
size_t i = 0; i <
sizeof(a.f32) /
sizeof(a.f32[0]); i++) {
2193 mem_addr[i] = a.f32[i];
2203 #if defined(SIMDE_SSE_NATIVE) 2204 _mm_store_ps1(mem_addr, a.n);
2207 for (
size_t i = 0; i <
sizeof(a.f32) /
sizeof(a.f32[0]); i++) {
2208 mem_addr[i] = a.f32[0];
2216 #if defined(SIMDE_SSE_NATIVE) 2217 _mm_store_ss(mem_addr, a.n);
2218 #elif defined(SIMDE_SSE_NEON) 2219 vst1q_lane_f32(mem_addr, a.neon_f32, 0);
2221 *mem_addr = a.f32[0];
2230 #if defined(SIMDE_SSE_NATIVE) 2231 _mm_store1_ps(mem_addr, a.n);
2240 #if defined(SIMDE_SSE_NATIVE) 2241 _mm_storeh_pi(&(mem_addr->n), a.n);
2243 mem_addr->
f32[0] = a.f32[2];
2244 mem_addr->
f32[1] = a.f32[3];
2251 #if defined(SIMDE_SSE_NATIVE) 2252 _mm_storel_pi(&(mem_addr->n), a.n);
2254 mem_addr->
f32[0] = a.f32[0];
2255 mem_addr->
f32[1] = a.f32[1];
2264 #if defined(SIMDE_SSE_NATIVE) 2265 _mm_storer_ps(mem_addr, a.n);
2268 for (
size_t i = 0; i <
sizeof(a.f32) /
sizeof(a.f32[0]); i++) {
2270 a.f32[((
sizeof(a.f32) /
sizeof(a.f32[0])) - 1) - i];
2278 #if defined(SIMDE_SSE_NATIVE) 2279 _mm_storeu_ps(mem_addr, a.n);
2280 #elif defined(SIMDE_SSE_NEON) 2281 vst1q_f32(mem_addr, a.neon_f32);
2284 for (
size_t i = 0; i <
sizeof(a.f32) /
sizeof(a.f32[0]); i++) {
2285 mem_addr[i] = a.f32[i];
2295 #if defined(SIMDE_SSE_NATIVE) 2296 r.n = _mm_sub_ps(a.n, b.n);
2297 #elif defined(SIMDE_SSE_NEON) 2298 r.neon_f32 = vsubq_f32(a.neon_f32, b.neon_f32);
2301 for (
size_t i = 0; i < (
sizeof(r.f32) /
sizeof(r.f32[0])); i++) {
2302 r.f32[i] = a.f32[i] - b.f32[i];
2314 #if defined(SIMDE_SSE_NATIVE) 2315 r.n = _mm_sub_ss(a.n, b.n);
2317 r.f32[0] = a.f32[0] - b.f32[0];
2318 r.f32[1] = a.f32[1];
2319 r.f32[2] = a.f32[2];
2320 r.f32[3] = a.f32[3];
2329 #if defined(SIMDE_SSE_NATIVE) 2330 return _mm_ucomieq_ss(a.n, b.n);
2333 int x = feholdexcept(&envp);
2334 int r = a.f32[0] == b.f32[0];
2344 #if defined(SIMDE_SSE_NATIVE) 2345 return _mm_ucomige_ss(a.n, b.n);
2348 int x = feholdexcept(&envp);
2349 int r = a.f32[0] >= b.f32[0];
2359 #if defined(SIMDE_SSE_NATIVE) 2360 return _mm_ucomigt_ss(a.n, b.n);
2363 int x = feholdexcept(&envp);
2364 int r = a.f32[0] > b.f32[0];
2374 #if defined(SIMDE_SSE_NATIVE) 2375 return _mm_ucomile_ss(a.n, b.n);
2378 int x = feholdexcept(&envp);
2379 int r = a.f32[0] <= b.f32[0];
2389 #if defined(SIMDE_SSE_NATIVE) 2390 return _mm_ucomilt_ss(a.n, b.n);
2393 int x = feholdexcept(&envp);
2394 int r = a.f32[0] < b.f32[0];
2404 #if defined(SIMDE_SSE_NATIVE) 2405 return _mm_ucomineq_ss(a.n, b.n);
2408 int x = feholdexcept(&envp);
2409 int r = a.f32[0] != b.f32[0];
2416 #if defined(SIMDE_SSE_NATIVE) 2417 #if defined(__has_builtin) 2418 #if __has_builtin(__builtin_ia32_undef128) 2419 #define SIMDE__HAVE_UNDEFINED128 2421 #elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793) 2422 #define SIMDE__HAVE_UNDEFINED128 2431 #if defined(SIMDE__HAVE_UNDEFINED128) 2432 r.n = _mm_undefined_ps();
2445 #if defined(SIMDE_SSE_NATIVE) 2446 r.n = _mm_unpackhi_ps(a.n, b.n);
2447 #elif defined(SIMDE_SSE_NEON) 2448 float32x2_t a1 = vget_high_f32(a.neon_f32);
2449 float32x2_t b1 = vget_high_f32(b.neon_f32);
2450 float32x2x2_t result = vzip_f32(a1, b1);
2451 r.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
2453 r.f32[0] = a.f32[2];
2454 r.f32[1] = b.f32[2];
2455 r.f32[2] = a.f32[3];
2456 r.f32[3] = b.f32[3];
2467 #if defined(SIMDE_SSE_NATIVE) 2468 r.n = _mm_unpacklo_ps(a.n, b.n);
2469 #elif defined(SIMDE_SSE_NEON) 2470 float32x2_t a1 = vget_low_f32(a.neon_f32);
2471 float32x2_t b1 = vget_low_f32(b.neon_f32);
2472 float32x2x2_t result = vzip_f32(a1, b1);
2473 r.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
2475 r.f32[0] = a.f32[0];
2476 r.f32[1] = b.f32[0];
2477 r.f32[2] = a.f32[1];
2478 r.f32[3] = b.f32[1];
2489 #if defined(SIMDE_SSE_NATIVE) 2490 r.n = _mm_xor_ps(a.n, b.n);
2491 #elif defined(SIMDE_SSE_NEON) 2492 r.neon_i32 = veorq_s32(a.neon_i32, b.neon_i32);
2495 for (
size_t i = 0; i < (
sizeof(r.u32) /
sizeof(r.u32[0])); i++) {
2496 r.u32[i] = a.u32[i] ^ b.u32[i];
2506 #if defined(SIMDE_SSE_NATIVE) 2507 _mm_stream_pi(&(mem_addr->n), a.n);
2509 mem_addr->
i64[0] = a.
i64[0];
2518 #if defined(SIMDE_SSE_NATIVE) 2519 _mm_stream_ps(mem_addr, a.n);
2522 memcpy(mem_addr, &a,
sizeof(a));
2529 #if defined(SIMDE_SSE_NATIVE) 2530 return _mm_getcsr();
2533 int rounding_mode = fegetround();
2535 switch (rounding_mode) {
2556 #if defined(SIMDE_SSE_NATIVE) 2559 switch ((a >> 13) & 3) {
2561 fesetround(FE_TONEAREST);
2564 fesetround(FE_DOWNWARD);
2567 fesetround(FE_UPWARD);
2570 fesetround(FE_TOWARDZERO);
2576 #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ 2578 simde__m128 tmp3, tmp2, tmp1, tmp0; \ 2579 tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \ 2580 tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \ 2581 tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \ 2582 tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \ 2583 row0 = simde_mm_movelh_ps(tmp0, tmp2); \ 2584 row1 = simde_mm_movehl_ps(tmp2, tmp0); \ 2585 row2 = simde_mm_movelh_ps(tmp1, tmp3); \ 2586 row3 = simde_mm_movehl_ps(tmp3, tmp1); \ SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_max_pu8(simde__m64 a, simde__m64 b)
Definition: sse.h:1534
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_extract_pi16(simde__m64 a, const int imm8)
Definition: sse.h:1259
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadh_pi(simde__m128 a, simde__m64 const *mem_addr)
Definition: sse.h:1408
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi16_ps(simde__m64 a)
Definition: sse.h:923
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpge_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:309
SIMDE_FLOAT32_TYPE simde_float32
Definition: simde-common.h:150
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpu8_ps(simde__m64 a)
Definition: sse.h:1059
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_undefined_ps(void)
Definition: sse.h:2427
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpu16_ps(simde__m64 a)
Definition: sse.h:1042
#define SIMDE__ASSUME_ALIGNED(ptr, align)
Definition: simde-common.h:251
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvttss_si32(simde__m128 a)
Definition: sse.h:1193
simde__m128
Definition: sse.h:124
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_ps1(simde_float32 mem_addr[4], simde__m128 a)
Definition: sse.h:2199
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnge_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:536
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnge_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:552
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpngt_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:571
#define HEDLEY_ARRAY_PARAM(name)
Definition: hedley.h:1309
SIMDE__FUNCTION_ATTRIBUTES void simde_MM_SET_ROUNDING_MODE(unsigned int a)
Definition: sse.h:1311
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvt_ps2pi(simde__m128 a)
Definition: sse.h:879
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_rsqrt_ps(simde__m128 a)
Definition: sse.h:1873
#define SIMDE__END_DECLS
Definition: simde-common.h:131
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvttps_pi32(simde__m128 a)
Definition: sse.h:1179
int16_t i16[4]
Definition: mmx.h:67
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_load_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
Definition: sse.h:1337
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtps_pi8(simde__m128 a)
Definition: sse.h:1025
#define SIMDE__VECTORIZE_REDUCTION(r)
Definition: simde-common.h:100
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_unpacklo_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:2463
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtsi64_ss(simde__m128 a, int64_t b)
Definition: sse.h:1094
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_load_ss(simde_float32 const *mem_addr)
Definition: sse.h:1373
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomigt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2357
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_movemask_pi8(simde__m64 a)
Definition: sse.h:1698
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_ps(simde_float32 mem_addr[4], simde__m128 a)
Definition: sse.h:2514
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_max_pi16(simde__m64 a, simde__m64 b)
Definition: sse.h:1497
int64_t i64[1]
Definition: mmx.h:69
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_movehl_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1664
SIMDE__FUNCTION_ATTRIBUTES uint32_t simde_mm_getcsr(void)
Definition: sse.h:2527
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storeh_pi(simde__m64 *mem_addr, simde__m128 a)
Definition: sse.h:2238
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_andnot_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:205
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_add_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:161
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpord_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:671
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvtt_ss2si(simde__m128 a)
Definition: sse.h:1169
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadu_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
Definition: sse.h:1463
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpngt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:587
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_setr_ps(simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0)
Definition: sse.h:2014
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi32_ps(simde__m128 a, simde__m64 b)
Definition: sse.h:940
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comilt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:828
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpgt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:373
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomieq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2327
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpge_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:328
simde_float32 f32[2]
Definition: mmx.h:74
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_rcp_ps(simde__m128 a)
Definition: sse.h:1827
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_mul_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1744
#define HEDLEY_LIKELY(expr)
Definition: hedley.h:1065
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_or_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1798
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storer_ps(simde_float32 mem_addr[4], simde__m128 a)
Definition: sse.h:2260
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpneq_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:489
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_rsqrt_ss(simde__m128 a)
Definition: sse.h:1905
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_prefetch(char const *p, int i)
Definition: sse.h:1817
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, char *mem_addr)
Definition: sse.h:1482
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store1_ps(simde_float32 mem_addr[4], simde__m128 a)
Definition: sse.h:2226
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomile_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2372
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadr_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
Definition: sse.h:1443
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_min_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:1627
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_rcp_ss(simde__m128 a)
Definition: sse.h:1856
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_sfence(void)
Definition: sse.h:2048
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpneq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:508
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comineq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:844
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_min_pi16(simde__m64 a, simde__m64 b)
Definition: sse.h:1572
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi32x2_ps(simde__m64 a, simde__m64 b)
Definition: sse.h:957
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:1647
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_setzero_ps(void)
Definition: sse.h:2032
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtps_pi32(simde__m128 a)
Definition: sse.h:1008
SIMDE__FUNCTION_ATTRIBUTES unsigned int simde_MM_GET_ROUNDING_MODE(void)
Definition: sse.h:1301
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_set_ps1(simde_float32 a)
Definition: sse.h:1983
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_add_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:142
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b)
Definition: sse.h:1937
int32_t i32[2]
Definition: mmx.h:68
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_avg_pu16(simde__m64 a, simde__m64 b)
Definition: sse.h:224
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmple_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:418
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storel_pi(simde__m64 *mem_addr, simde__m128 a)
Definition: sse.h:2249
SIMDE__FUNCTION_ATTRIBUTES int64_t simde_mm_cvtss_si64(simde__m128 a)
Definition: sse.h:1138
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comile_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:812
#define SIMDE__BEGIN_DECLS
Definition: simde-common.h:130
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_pi(simde__m64 *mem_addr, simde__m64 a)
Definition: sse.h:2504
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comige_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:778
#define SIMDE__FUNCTION_ATTRIBUTES
Definition: simde-common.h:121
#define SIMDE_FLOAT32_C(value)
Definition: simde-common.h:146
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_mulhi_pu16(simde__m64 a, simde__m64 b)
Definition: sse.h:1780
uint8_t u8[8]
Definition: mmx.h:70
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_max_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:1552
#define SIMDE__VECTORIZE_ALIGNED(a)
Definition: simde-common.h:101
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_movelh_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1681
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtps_pi16(simde__m128 a)
Definition: sse.h:991
SIMDE__FUNCTION_ATTRIBUTES int64_t simde_mm_cvttss_si64(simde__m128 a)
Definition: sse.h:1203
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpgt_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:354
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomige_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2342
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:696
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_movemask_ps(simde__m128 a)
Definition: sse.h:1717
#define SIMDE_ACCURACY_ITERS
Definition: simde-common.h:216
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_div_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:1238
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomineq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2402
#define SIMDE_ALIGN(alignment)
Definition: sse.h:77
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_min_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1590
#define simde_assert_aligned(alignment, val)
Definition: simde-common.h:50
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnle_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:606
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_sqrt_ps(simde__m128 a)
Definition: sse.h:2140
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_unpackhi_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:2441
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmplt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:463
SIMDE__FUNCTION_ATTRIBUTES simde_float32 simde_mm_cvtss_f32(simde__m128 a)
Definition: sse.h:1116
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comieq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:762
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtt_ps2pi(simde__m128 a)
Definition: sse.h:1152
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_ss(simde_float32 *mem_addr, simde__m128 a)
Definition: sse.h:2214
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_div_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1217
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_sub_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2310
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_and_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:186
uint16_t u16[4]
Definition: mmx.h:71
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storeu_ps(simde_float32 mem_addr[4], simde__m128 a)
Definition: sse.h:2276
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_load_ps1(simde_float32 const *mem_addr)
Definition: sse.h:1355
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnlt_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:641
HEDLEY_STATIC_ASSERT(16==sizeof(simde__m128), "simde__m128 size incorrect")
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvt_ss2si(simde__m128 a)
Definition: sse.h:913
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi8_ps(simde__m64 a)
Definition: sse.h:974
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmple_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:399
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtsi32_ss(simde__m128 a, int32_t b)
Definition: sse.h:1076
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_mul_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:1763
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvtss_si32(simde__m128 a)
Definition: sse.h:1128
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpunord_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:741
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvt_si2ss(simde__m128 a, int32_t b)
Definition: sse.h:896
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvt_pi2ps(simde__m128 a, simde__m64 b)
Definition: sse.h:862
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0)
Definition: sse.h:1962
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_max_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1515
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnle_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:622
#define SIMDE__VECTORIZE
Definition: simde-common.h:98
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_insert_pi16(simde__m64 a, int16_t i, const int imm8)
Definition: sse.h:1321
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmplt_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:444
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_shuffle_ps(simde__m128 a, simde__m128 b, const int imm8)
Definition: sse.h:2116
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_setcsr(uint32_t a)
Definition: sse.h:2554
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomilt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2387
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_sqrt_ss(simde__m128 a)
Definition: sse.h:2162
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnlt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:657
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comigt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:795
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_avg_pu8(simde__m64 a, simde__m64 b)
Definition: sse.h:244
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_set_ss(simde_float32 a)
Definition: sse.h:2000
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_shuffle_pi16(simde__m64 a, const int imm8)
Definition: sse.h:2087
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_min_pu8(simde__m64 a, simde__m64 b)
Definition: sse.h:1609
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_sub_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:2291
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpunord_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:723
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_xor_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:2485
int8_t i8[8]
Definition: mmx.h:66
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadl_pi(simde__m128 a, simde__m64 const *mem_addr)
Definition: sse.h:1425
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_ps(simde_float32 mem_addr[4], simde__m128 a)
Definition: sse.h:2182
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_load1_ps(simde_float32 const *mem_addr)
Definition: sse.h:1392
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpeq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:283
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpeq_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:264