sse.h
1 /* Permission is hereby granted, free of charge, to any person
2  * obtaining a copy of this software and associated documentation
3  * files (the "Software"), to deal in the Software without
4  * restriction, including without limitation the rights to use, copy,
5  * modify, merge, publish, distribute, sublicense, and/or sell copies
6  * of the Software, and to permit persons to whom the Software is
7  * furnished to do so, subject to the following conditions:
8  *
9  * The above copyright notice and this permission notice shall be
10  * included in all copies or substantial portions of the Software.
11  *
12  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
13  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
14  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
15  * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
16  * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
17  * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
18  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19  * SOFTWARE.
20  *
21  * Copyright:
22  * 2017 Evan Nemerson <evan@nemerson.com>
23  * 2015-2017 John W. Ratcliff <jratcliffscarab@gmail.com>
24  * 2015 Brandon Rowlett <browlett@nvidia.com>
25  * 2015 Ken Fast <kfast@gdeb.com>
26  */
27 
28 #if !defined(SIMDE__SSE_H)
29 #if !defined(SIMDE__SSE_H)
30 #define SIMDE__SSE_H
31 #endif
32 #include "mmx.h"
33 
34 #if defined(SIMDE_SSE_NATIVE)
35 #undef SIMDE_SSE_NATIVE
36 #endif
37 #if defined(SIMDE_SSE_FORCE_NATIVE)
38 #define SIMDE_SSE_NATIVE
39 #elif defined(__SSE__) && !defined(SIMDE_SSE_NO_NATIVE) && \
40  !defined(SIMDE_NO_NATIVE)
41 #define SIMDE_SSE_NATIVE
42 #elif defined(__ARM_NEON) && !defined(SIMDE_SSE_NO_NEON) && \
43  !defined(SIMDE_NO_NEON)
44 #define SIMDE_SSE_NEON
45 #endif
46 
47 #if defined(SIMDE_SSE_NATIVE) && !defined(SIMDE_MMX_NATIVE)
48 #if defined(SIMDE_SSE_FORCE_NATIVE)
49 #error Native SSE support requires native MMX support
50 #else
51 #warning Native SSE support requires native MMX support, disabling
52 #undef SIMDE_SSE_NATIVE
53 #endif
54 #elif defined(SIMDE_SSE_NEON) && !defined(SIMDE_MMX_NEON)
55 #warning SSE NEON support requires MMX NEON support, disabling
56 #undef SIMDE_SSE_NEON
57 #endif
58 
59 #if defined(SIMDE_SSE_NATIVE)
60 #include <xmmintrin.h>
61 #else
62 #if defined(SIMDE_SSE_NEON)
63 #include <arm_neon.h>
64 #endif
65 
66 #if !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \
67  (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
68 #include <stdatomic.h>
69 #elif defined(_WIN32)
70 #include <Windows.h>
71 #endif
72 #endif
73 
74 #include <math.h>
75 #include <fenv.h>
76 
77 #define SIMDE_ALIGN(alignment) __attribute__((aligned(alignment)))
78 SIMDE__BEGIN_DECLS
79 
80 typedef SIMDE_ALIGN(16) union {
81 #if defined(SIMDE__ENABLE_GCC_VEC_EXT)
82  int8_t i8 __attribute__((__vector_size__(16), __may_alias__));
83  int16_t i16 __attribute__((__vector_size__(16), __may_alias__));
84  int32_t i32 __attribute__((__vector_size__(16), __may_alias__));
85  int64_t i64 __attribute__((__vector_size__(16), __may_alias__));
86  uint8_t u8 __attribute__((__vector_size__(16), __may_alias__));
87  uint16_t u16 __attribute__((__vector_size__(16), __may_alias__));
88  uint32_t u32 __attribute__((__vector_size__(16), __may_alias__));
89  uint64_t u64 __attribute__((__vector_size__(16), __may_alias__));
90 #if defined(SIMDE__HAVE_INT128)
91  simde_int128 i128 __attribute__((__vector_size__(16), __may_alias__));
92  simde_uint128 u128 __attribute__((__vector_size__(16), __may_alias__));
93 #endif
94  simde_float32 f32 __attribute__((__vector_size__(16), __may_alias__));
95 #else
96  int8_t i8[16];
97  int16_t i16[8];
98  int32_t i32[4];
99  int64_t i64[2];
100  uint8_t u8[16];
101  uint16_t u16[8];
102  uint32_t u32[4];
103  uint64_t u64[2];
104 #if defined(SIMDE__HAVE_INT128)
105  simde_int128 i128[1];
106  simde_uint128 u128[1];
107 #endif
108  simde_float32 f32[4];
109 #endif
110 
111 #if defined(SIMDE_SSE_NATIVE)
112  __m128 n;
113 #elif defined(SIMDE_SSE_NEON)
114  int8x16_t neon_i8;
115  int16x8_t neon_i16;
116  int32x4_t neon_i32;
117  int64x2_t neon_i64;
118  uint8x16_t neon_u8;
119  uint16x8_t neon_u16;
120  uint32x4_t neon_u32;
121  uint64x2_t neon_u64;
122  float32x4_t neon_f32;
123 #endif
124 } simde__m128;
125 
126 #if defined(SIMDE_SSE_NATIVE)
127 HEDLEY_STATIC_ASSERT(sizeof(__m128) == sizeof(simde__m128),
128  "__m128 size doesn't match simde__m128 size");
129 SIMDE__FUNCTION_ATTRIBUTES simde__m128 SIMDE__M128_C(__m128 v)
130 {
131  simde__m128 r;
132  r.n = v;
133  return r;
134 }
135 #elif defined(SIMDE_SSE_NEON)
136 #define SIMDE__M128_NEON_C(T, expr) \
137  (simde__m128) { .neon_##T = expr }
138 #endif
139 HEDLEY_STATIC_ASSERT(16 == sizeof(simde__m128), "simde__m128 size incorrect");
140 
141 SIMDE__FUNCTION_ATTRIBUTES
142 simde__m128 simde_mm_add_ps(simde__m128 a, simde__m128 b)
143 {
144  simde__m128 r;
145 
146 #if defined(SIMDE_SSE_NATIVE)
147  r.n = _mm_add_ps(a.n, b.n);
148 #elif defined(SIMDE_SSE_NEON)
149  r.neon_f32 = vaddq_f32(a.neon_f32, b.neon_f32);
150 #else
152  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
153  r.f32[i] = a.f32[i] + b.f32[i];
154  }
155 #endif
156 
157  return r;
158 }
159 
160 SIMDE__FUNCTION_ATTRIBUTES
161 simde__m128 simde_mm_add_ss(simde__m128 a, simde__m128 b)
162 {
163  simde__m128 r;
164 
165 #if defined(SIMDE_SSE_NATIVE)
166  r.n = _mm_add_ss(a.n, b.n);
167 #elif defined(SIMDE_SSE_NEON)
168  float32_t b0 = vgetq_lane_f32(b.neon_f32, 0);
169  float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0);
170  /* the upper values in the result must be the remnants of <a>. */
171  r.neon_f32 = vaddq_f32(a.neon_f32, value);
172 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
173  r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32, simde_mm_add_ps(a, b).f32,
174  4, 1, 2, 3);
175 #else
176  r.f32[0] = a.f32[0] + b.f32[0];
177  r.f32[1] = a.f32[1];
178  r.f32[2] = a.f32[2];
179  r.f32[3] = a.f32[3];
180 #endif
181 
182  return r;
183 }
184 
185 SIMDE__FUNCTION_ATTRIBUTES
186 simde__m128 simde_mm_and_ps(simde__m128 a, simde__m128 b)
187 {
188  simde__m128 r;
189 
190 #if defined(SIMDE_SSE_NATIVE)
191  r.n = _mm_and_ps(a.n, b.n);
192 #elif defined(SIMDE_SSE_NEON)
193  r.neon_i32 = vandq_s32(a.neon_i32, b.neon_i32);
194 #else
196  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
197  r.i32[i] = a.i32[i] & b.i32[i];
198  }
199 #endif
200 
201  return r;
202 }
203 
204 SIMDE__FUNCTION_ATTRIBUTES
205 simde__m128 simde_mm_andnot_ps(simde__m128 a, simde__m128 b)
206 {
207  simde__m128 r;
208 
209 #if defined(SIMDE_SSE_NATIVE)
210  r.n = _mm_andnot_ps(a.n, b.n);
211 #elif defined(SIMDE_SSE_NEON)
212  r.neon_i32 = vbicq_s32(b.neon_i32, a.neon_i32);
213 #else
215  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
216  r.i32[i] = ~(a.i32[i]) & b.i32[i];
217  }
218 #endif
219 
220  return r;
221 }
222 
223 SIMDE__FUNCTION_ATTRIBUTES
224 simde__m64 simde_mm_avg_pu16(simde__m64 a, simde__m64 b)
225 {
226  simde__m64 r;
227 
228 #if defined(SIMDE_SSE_NATIVE)
229  r.n = _mm_avg_pu16(a.n, b.n);
230 #elif defined(SIMDE_SSE_NEON)
231  r.neon_u16 = vrhadd_u16(b.neon_u16, a.neon_u16);
232 #else
234  for (size_t i = 0; i < 4; i++) {
235  r.u16[i] = (a.u16[i] + b.u16[i] + 1) >> 1;
236  }
237 #endif
238 
239  return r;
240 }
241 #define simde_m_pavgw(a, b) simde_mm_avg_pu16(a, b)
242 
243 SIMDE__FUNCTION_ATTRIBUTES
244 simde__m64 simde_mm_avg_pu8(simde__m64 a, simde__m64 b)
245 {
246  simde__m64 r;
247 
248 #if defined(SIMDE_SSE_NATIVE)
249  r.n = _mm_avg_pu8(a.n, b.n);
250 #elif defined(SIMDE_SSE_NEON)
251  r.neon_u8 = vrhadd_u8(b.neon_u8, a.neon_u8);
252 #else
254  for (size_t i = 0; i < 8; i++) {
255  r.u8[i] = (a.u8[i] + b.u8[i] + 1) >> 1;
256  }
257 #endif
258 
259  return r;
260 }
261 #define simde_m_pavgb(a, b) simde_mm_avg_pu8(a, b)
262 
263 SIMDE__FUNCTION_ATTRIBUTES
264 simde__m128 simde_mm_cmpeq_ps(simde__m128 a, simde__m128 b)
265 {
266  simde__m128 r;
267 
268 #if defined(SIMDE_SSE_NATIVE)
269  r.n = _mm_cmpeq_ps(a.n, b.n);
270 #elif defined(SIMDE_SSE_NEON)
271  r.neon_u32 = vceqq_f32(a.neon_f32, b.neon_f32);
272 #else
274  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
275  r.u32[i] = (a.f32[i] == b.f32[i]) ? 0xffffffff : 0;
276  }
277 #endif
278 
279  return r;
280 }
281 
282 SIMDE__FUNCTION_ATTRIBUTES
283 simde__m128 simde_mm_cmpeq_ss(simde__m128 a, simde__m128 b)
284 {
285  simde__m128 r;
286 
287 #if defined(SIMDE_SSE_NATIVE)
288  r.n = _mm_cmpeq_ss(a.n, b.n);
289 #elif defined(SIMDE_SSE_NEON)
290  float32x4_t s =
291  vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32));
292  float32x4_t t = vextq_f32(a.neon_f32, s, 1);
293  r.neon_f32 = vextq_f32(t, t, 3);
294 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
295  r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
296  simde_mm_cmpeq_ps(a, b).f32, 4, 1, 2, 3);
297 #else
298  r.u32[0] = (a.f32[0] == b.f32[0]) ? 0xffffffff : 0;
300  for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
301  r.u32[i] = a.u32[i];
302  }
303 #endif
304 
305  return r;
306 }
307 
308 SIMDE__FUNCTION_ATTRIBUTES
309 simde__m128 simde_mm_cmpge_ps(simde__m128 a, simde__m128 b)
310 {
311  simde__m128 r;
312 
313 #if defined(SIMDE_SSE_NATIVE)
314  r.n = _mm_cmpge_ps(a.n, b.n);
315 #elif defined(SIMDE_SSE_NEON)
316  r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32);
317 #else
319  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
320  r.u32[i] = (a.f32[i] >= b.f32[i]) ? 0xffffffff : 0;
321  }
322 #endif
323 
324  return r;
325 }
326 
327 SIMDE__FUNCTION_ATTRIBUTES
328 simde__m128 simde_mm_cmpge_ss(simde__m128 a, simde__m128 b)
329 {
330  simde__m128 r;
331 
332 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
333  r.n = _mm_cmpge_ss(a.n, b.n);
334 #elif defined(SIMDE_SSE_NEON)
335  float32x4_t s =
336  vreinterpretq_f32_u32(vcgeq_f32(a.neon_f32, b.neon_f32));
337  float32x4_t t = vextq_f32(a.neon_f32, s, 1);
338  r.neon_f32 = vextq_f32(t, t, 3);
339 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
340  r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
341  simde_mm_cmpge_ps(a, b).f32, 4, 1, 2, 3);
342 #else
343  r.u32[0] = (a.f32[0] >= b.f32[0]) ? 0xffffffff : 0;
345  for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
346  r.u32[i] = a.u32[i];
347  }
348 #endif
349 
350  return r;
351 }
352 
353 SIMDE__FUNCTION_ATTRIBUTES
354 simde__m128 simde_mm_cmpgt_ps(simde__m128 a, simde__m128 b)
355 {
356  simde__m128 r;
357 
358 #if defined(SIMDE_SSE_NATIVE)
359  r.n = _mm_cmpgt_ps(a.n, b.n);
360 #elif defined(SIMDE_SSE_NEON)
361  r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32);
362 #else
364  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
365  r.u32[i] = (a.f32[i] > b.f32[i]) ? 0xffffffff : 0;
366  }
367 #endif
368 
369  return r;
370 }
371 
372 SIMDE__FUNCTION_ATTRIBUTES
373 simde__m128 simde_mm_cmpgt_ss(simde__m128 a, simde__m128 b)
374 {
375  simde__m128 r;
376 
377 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
378  r.n = _mm_cmpgt_ss(a.n, b.n);
379 #elif defined(SIMDE_SSE_NEON)
380  float32x4_t s =
381  vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32));
382  float32x4_t t = vextq_f32(a.neon_f32, s, 1);
383  r.neon_f32 = vextq_f32(t, t, 3);
384 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
385  r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
386  simde_mm_cmpgt_ps(a, b).f32, 4, 1, 2, 3);
387 #else
388  r.u32[0] = (a.f32[0] > b.f32[0]) ? 0xffffffff : 0;
390  for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
391  r.u32[i] = a.u32[i];
392  }
393 #endif
394 
395  return r;
396 }
397 
398 SIMDE__FUNCTION_ATTRIBUTES
399 simde__m128 simde_mm_cmple_ps(simde__m128 a, simde__m128 b)
400 {
401  simde__m128 r;
402 
403 #if defined(SIMDE_SSE_NATIVE)
404  r.n = _mm_cmple_ps(a.n, b.n);
405 #elif defined(SIMDE_SSE_NEON)
406  r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32);
407 #else
409  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
410  r.u32[i] = (a.f32[i] <= b.f32[i]) ? 0xffffffff : 0;
411  }
412 #endif
413 
414  return r;
415 }
416 
417 SIMDE__FUNCTION_ATTRIBUTES
418 simde__m128 simde_mm_cmple_ss(simde__m128 a, simde__m128 b)
419 {
420  simde__m128 r;
421 
422 #if defined(SIMDE_SSE_NATIVE)
423  r.n = _mm_cmple_ss(a.n, b.n);
424 #elif defined(SIMDE_SSE_NEON)
425  float32x4_t s =
426  vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32));
427  float32x4_t t = vextq_f32(a.neon_f32, s, 1);
428  r.neon_f32 = vextq_f32(t, t, 3);
429 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
430  r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
431  simde_mm_cmple_ps(a, b).f32, 4, 1, 2, 3);
432 #else
433  r.u32[0] = (a.f32[0] <= b.f32[0]) ? 0xffffffff : 0;
435  for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
436  r.u32[i] = a.u32[i];
437  }
438 #endif
439 
440  return r;
441 }
442 
443 SIMDE__FUNCTION_ATTRIBUTES
444 simde__m128 simde_mm_cmplt_ps(simde__m128 a, simde__m128 b)
445 {
446  simde__m128 r;
447 
448 #if defined(SIMDE_SSE_NATIVE)
449  r.n = _mm_cmplt_ps(a.n, b.n);
450 #elif defined(SIMDE_SSE_NEON)
451  r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32);
452 #else
454  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
455  r.u32[i] = (a.f32[i] < b.f32[i]) ? 0xffffffff : 0;
456  }
457 #endif
458 
459  return r;
460 }
461 
462 SIMDE__FUNCTION_ATTRIBUTES
463 simde__m128 simde_mm_cmplt_ss(simde__m128 a, simde__m128 b)
464 {
465  simde__m128 r;
466 
467 #if defined(SIMDE_SSE_NATIVE)
468  r.n = _mm_cmplt_ss(a.n, b.n);
469 #elif defined(SIMDE_SSE_NEON)
470  float32x4_t s =
471  vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32));
472  float32x4_t t = vextq_f32(a.neon_f32, s, 1);
473  r.neon_f32 = vextq_f32(t, t, 3);
474 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
475  r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
476  simde_mm_cmplt_ps(a, b).f32, 4, 1, 2, 3);
477 #else
478  r.u32[0] = (a.f32[0] < b.f32[0]) ? 0xffffffff : 0;
480  for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
481  r.u32[i] = a.u32[i];
482  }
483 #endif
484 
485  return r;
486 }
487 
488 SIMDE__FUNCTION_ATTRIBUTES
489 simde__m128 simde_mm_cmpneq_ps(simde__m128 a, simde__m128 b)
490 {
491  simde__m128 r;
492 
493 #if defined(SIMDE_SSE_NATIVE)
494  r.n = _mm_cmpneq_ps(a.n, b.n);
495 #elif defined(SIMDE_SSE_NEON)
496  r.neon_u32 = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32));
497 #else
499  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
500  r.u32[i] = (a.f32[i] != b.f32[i]) ? 0xffffffff : 0;
501  }
502 #endif
503 
504  return r;
505 }
506 
507 SIMDE__FUNCTION_ATTRIBUTES
508 simde__m128 simde_mm_cmpneq_ss(simde__m128 a, simde__m128 b)
509 {
510  simde__m128 r;
511 
512 #if defined(SIMDE_SSE_NATIVE)
513  r.n = _mm_cmpneq_ss(a.n, b.n);
514 #elif defined(SIMDE_SSE_NEON)
515  float32x4_t e =
516  vreinterpretq_f32_u32(vceqq_f32(a.neon_f32, b.neon_f32));
517  float32x4_t s =
518  vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(e)));
519  float32x4_t t = vextq_f32(a.neon_f32, s, 1);
520  r.neon_f32 = vextq_f32(t, t, 3);
521 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
522  r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
523  simde_mm_cmpneq_ps(a, b).f32, 4, 1, 2, 3);
524 #else
525  r.u32[0] = (a.f32[0] != b.f32[0]) ? 0xffffffff : 0;
527  for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
528  r.u32[i] = a.u32[i];
529  }
530 #endif
531 
532  return r;
533 }
534 
535 SIMDE__FUNCTION_ATTRIBUTES
536 simde__m128 simde_mm_cmpnge_ps(simde__m128 a, simde__m128 b)
537 {
538  simde__m128 r;
539 
540 #if defined(SIMDE_SSE_NATIVE)
541  r.n = _mm_cmpnge_ps(a.n, b.n);
542 #elif defined(SIMDE_SSE_NEON)
543  r.neon_u32 = vcltq_f32(a.neon_f32, b.neon_f32);
544 #else
545  r = simde_mm_cmplt_ps(a, b);
546 #endif
547 
548  return r;
549 }
550 
551 SIMDE__FUNCTION_ATTRIBUTES
552 simde__m128 simde_mm_cmpnge_ss(simde__m128 a, simde__m128 b)
553 {
554  simde__m128 r;
555 
556 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
557  r.n = _mm_cmpnge_ss(a.n, b.n);
558 #elif defined(SIMDE_SSE_NEON)
559  float32x4_t s =
560  vreinterpretq_f32_u32(vcltq_f32(a.neon_f32, b.neon_f32));
561  float32x4_t t = vextq_f32(a.neon_f32, s, 1);
562  r.neon_f32 = vextq_f32(t, t, 3);
563 #else
564  r = simde_mm_cmplt_ss(a, b);
565 #endif
566 
567  return r;
568 }
569 
570 SIMDE__FUNCTION_ATTRIBUTES
571 simde__m128 simde_mm_cmpngt_ps(simde__m128 a, simde__m128 b)
572 {
573  simde__m128 r;
574 
575 #if defined(SIMDE_SSE_NATIVE)
576  r.n = _mm_cmpngt_ps(a.n, b.n);
577 #elif defined(SIMDE_SSE_NEON)
578  r.neon_u32 = vcleq_f32(a.neon_f32, b.neon_f32);
579 #else
580  r = simde_mm_cmple_ps(a, b);
581 #endif
582 
583  return r;
584 }
585 
586 SIMDE__FUNCTION_ATTRIBUTES
587 simde__m128 simde_mm_cmpngt_ss(simde__m128 a, simde__m128 b)
588 {
589  simde__m128 r;
590 
591 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
592  r.n = _mm_cmpngt_ss(a.n, b.n);
593 #elif defined(SIMDE_SSE_NEON)
594  float32x4_t s =
595  vreinterpretq_f32_u32(vcleq_f32(a.neon_f32, b.neon_f32));
596  float32x4_t t = vextq_f32(a.neon_f32, s, 1);
597  r.neon_f32 = vextq_f32(t, t, 3);
598 #else
599  r = simde_mm_cmple_ss(a, b);
600 #endif
601 
602  return r;
603 }
604 
605 SIMDE__FUNCTION_ATTRIBUTES
606 simde__m128 simde_mm_cmpnle_ps(simde__m128 a, simde__m128 b)
607 {
608  simde__m128 r;
609 
610 #if defined(SIMDE_SSE_NATIVE)
611  r.n = _mm_cmpnle_ps(a.n, b.n);
612 #elif defined(SIMDE_SSE_NEON)
613  r.neon_u32 = vcgtq_f32(a.neon_f32, b.neon_f32);
614 #else
615  r = simde_mm_cmpgt_ps(a, b);
616 #endif
617 
618  return r;
619 }
620 
621 SIMDE__FUNCTION_ATTRIBUTES
622 simde__m128 simde_mm_cmpnle_ss(simde__m128 a, simde__m128 b)
623 {
624  simde__m128 r;
625 
626 #if defined(SIMDE_SSE_NATIVE)
627  r.n = _mm_cmpnle_ss(a.n, b.n);
628 #elif defined(SIMDE_SSE_NEON)
629  float32x4_t s =
630  vreinterpretq_f32_u32(vcgtq_f32(a.neon_f32, b.neon_f32));
631  float32x4_t t = vextq_f32(a.neon_f32, s, 1);
632  r.neon_f32 = vextq_f32(t, t, 3);
633 #else
634  r = simde_mm_cmpgt_ss(a, b);
635 #endif
636 
637  return r;
638 }
639 
640 SIMDE__FUNCTION_ATTRIBUTES
641 simde__m128 simde_mm_cmpnlt_ps(simde__m128 a, simde__m128 b)
642 {
643  simde__m128 r;
644 
645 #if defined(SIMDE_SSE_NATIVE)
646  r.n = _mm_cmpnlt_ps(a.n, b.n);
647 #elif defined(SIMDE_SSE_NEON)
648  r.neon_u32 = vcgeq_f32(a.neon_f32, b.neon_f32);
649 #else
650  r = simde_mm_cmpge_ps(a, b);
651 #endif
652 
653  return r;
654 }
655 
656 SIMDE__FUNCTION_ATTRIBUTES
657 simde__m128 simde_mm_cmpnlt_ss(simde__m128 a, simde__m128 b)
658 {
659  simde__m128 r;
660 
661 #if defined(SIMDE_SSE_NATIVE)
662  r.n = _mm_cmpnlt_ss(a.n, b.n);
663 #else
664  r = simde_mm_cmpge_ss(a, b);
665 #endif
666 
667  return r;
668 }
669 
670 SIMDE__FUNCTION_ATTRIBUTES
671 simde__m128 simde_mm_cmpord_ps(simde__m128 a, simde__m128 b)
672 {
673  simde__m128 r;
674 
675 #if defined(SIMDE_SSE_NATIVE)
676  r.n = _mm_cmpord_ps(a.n, b.n);
677 #elif defined(SIMDE_SSE_NEON)
678  /* Note: NEON does not have an ordered-compare builtin.
679  Compare a == a and b == b to detect NaN,
680  then AND the results to get the final mask. */
681  uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32);
682  uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32);
683  r.neon_u32 = vandq_u32(ceqaa, ceqbb);
684 #else
686  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
687  r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0
688  : 0xffffffff;
689  }
690 #endif
691 
692  return r;
693 }
694 
695 SIMDE__FUNCTION_ATTRIBUTES
696 simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b)
697 {
698  simde__m128 r;
699 
700 #if defined(SIMDE_SSE_NATIVE)
701  r.n = _mm_cmpord_ss(a.n, b.n);
702 #elif defined(SIMDE_SSE_NEON)
703  uint32x4_t ceqaa = vceqq_f32(a.neon_f32, a.neon_f32);
704  uint32x4_t ceqbb = vceqq_f32(b.neon_f32, b.neon_f32);
705  float32x4_t s = vreinterpretq_f32_u32(vandq_u32(ceqaa, ceqbb));
706  float32x4_t t = vextq_f32(a.neon_f32, s, 1);
707  r.neon_f32 = vextq_f32(t, t, 3);
708 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
709  r.f32 = SIMDE__SHUFFLE_VECTOR(32, 16, a.f32,
710  simde_mm_cmpord_ps(a, b).f32, 4, 1, 2, 3);
711 #else
712  r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0 : 0xffffffff;
714  for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
715  r.f32[i] = a.f32[i];
716  }
717 #endif
718 
719  return r;
720 }
721 
722 SIMDE__FUNCTION_ATTRIBUTES
723 simde__m128 simde_mm_cmpunord_ps(simde__m128 a, simde__m128 b)
724 {
725  simde__m128 r;
726 
727 #if defined(SIMDE_SSE_NATIVE)
728  r.n = _mm_cmpunord_ps(a.n, b.n);
729 #else
731  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
732  r.u32[i] = (isnan(a.f32[i]) || isnan(b.f32[i])) ? 0xffffffff
733  : 0;
734  }
735 #endif
736 
737  return r;
738 }
739 
740 SIMDE__FUNCTION_ATTRIBUTES
741 simde__m128 simde_mm_cmpunord_ss(simde__m128 a, simde__m128 b)
742 {
743  simde__m128 r;
744 
745 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
746  r.n = _mm_cmpunord_ss(a.n, b.n);
747 #elif defined(SIMDE__SHUFFLE_VECTOR) && defined(SIMDE_ASSUME_VECTORIZATION)
748  r.f32 = SIMDE__SHUFFLE_VECTOR(
749  32, 16, a.f32, simde_mm_cmpunord_ps(a, b).f32, 4, 1, 2, 3);
750 #else
751  r.u32[0] = (isnan(a.f32[0]) || isnan(b.f32[0])) ? 0xffffffff : 0;
753  for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
754  r.f32[i] = a.f32[i];
755  }
756 #endif
757 
758  return r;
759 }
760 
761 SIMDE__FUNCTION_ATTRIBUTES
762 int simde_mm_comieq_ss(simde__m128 a, simde__m128 b)
763 {
764 #if defined(SIMDE_SSE_NATIVE)
765  return _mm_comieq_ss(a.n, b.n);
766 #elif defined(SIMDE_SSE_NEON)
767  uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
768  uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
769  uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
770  uint32x4_t a_eq_b = vceqq_f32(a.neon_f32, b.neon_f32);
771  return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_eq_b), 0) != 0) ? 1 : 0;
772 #else
773  return a.f32[0] == b.f32[0];
774 #endif
775 }
776 
777 SIMDE__FUNCTION_ATTRIBUTES
778 int simde_mm_comige_ss(simde__m128 a, simde__m128 b)
779 {
780 #if defined(SIMDE_SSE_NATIVE)
781  return _mm_comige_ss(a.n, b.n);
782 #elif defined(SIMDE_SSE_NEON)
783  uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
784  uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
785  uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
786  uint32x4_t a_ge_b = vcgeq_f32(a.neon_f32, b.neon_f32);
787  return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1
788  : 0;
789 #else
790  return a.f32[0] >= b.f32[0];
791 #endif
792 }
793 
794 SIMDE__FUNCTION_ATTRIBUTES
795 int simde_mm_comigt_ss(simde__m128 a, simde__m128 b)
796 {
797 #if defined(SIMDE_SSE_NATIVE)
798  return _mm_comigt_ss(a.n, b.n);
799 #elif defined(SIMDE_SSE_NEON)
800  uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
801  uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
802  uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
803  uint32x4_t a_gt_b = vcgtq_f32(a.neon_f32, b.neon_f32);
804  return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1
805  : 0;
806 #else
807  return a.f32[0] > b.f32[0];
808 #endif
809 }
810 
811 SIMDE__FUNCTION_ATTRIBUTES
812 int simde_mm_comile_ss(simde__m128 a, simde__m128 b)
813 {
814 #if defined(SIMDE_SSE_NATIVE)
815  return _mm_comile_ss(a.n, b.n);
816 #elif defined(SIMDE_SSE_NEON)
817  uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
818  uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
819  uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
820  uint32x4_t a_le_b = vcleq_f32(a.neon_f32, b.neon_f32);
821  return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_le_b), 0) != 0) ? 1 : 0;
822 #else
823  return a.f32[0] <= b.f32[0];
824 #endif
825 }
826 
827 SIMDE__FUNCTION_ATTRIBUTES
828 int simde_mm_comilt_ss(simde__m128 a, simde__m128 b)
829 {
830 #if defined(SIMDE_SSE_NATIVE)
831  return _mm_comilt_ss(a.n, b.n);
832 #elif defined(SIMDE_SSE_NEON)
833  uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
834  uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
835  uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan));
836  uint32x4_t a_lt_b = vcltq_f32(a.neon_f32, b.neon_f32);
837  return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_lt_b), 0) != 0) ? 1 : 0;
838 #else
839  return a.f32[0] < b.f32[0];
840 #endif
841 }
842 
843 SIMDE__FUNCTION_ATTRIBUTES
844 int simde_mm_comineq_ss(simde__m128 a, simde__m128 b)
845 {
846 #if defined(SIMDE_SSE_NATIVE)
847  return _mm_comineq_ss(a.n, b.n);
848 #elif defined(SIMDE_SSE_NEON)
849  uint32x4_t a_not_nan = vceqq_f32(a.neon_f32, a.neon_f32);
850  uint32x4_t b_not_nan = vceqq_f32(b.neon_f32, b.neon_f32);
851  uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan);
852  uint32x4_t a_neq_b = vmvnq_u32(vceqq_f32(a.neon_f32, b.neon_f32));
853  return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_neq_b), 0) != 0)
854  ? 1
855  : 0;
856 #else
857  return a.f32[0] != b.f32[0];
858 #endif
859 }
860 
861 SIMDE__FUNCTION_ATTRIBUTES
862 simde__m128 simde_mm_cvt_pi2ps(simde__m128 a, simde__m64 b)
863 {
864  simde__m128 r;
865 
866 #if defined(SIMDE_SSE_NATIVE)
867  r.n = _mm_cvt_pi2ps(a.n, b.n);
868 #else
869  r.f32[0] = (simde_float32)b.i32[0];
870  r.f32[1] = (simde_float32)b.i32[1];
871  r.i32[2] = a.i32[2];
872  r.i32[3] = a.i32[3];
873 #endif
874 
875  return r;
876 }
877 
878 SIMDE__FUNCTION_ATTRIBUTES
879 simde__m64 simde_mm_cvt_ps2pi(simde__m128 a)
880 {
881  simde__m64 r;
882 
883 #if defined(SIMDE_SSE_NATIVE)
884  r.n = _mm_cvt_ps2pi(a.n);
885 #else
887  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
888  r.i32[i] = (int32_t)a.f32[i];
889  }
890 #endif
891 
892  return r;
893 }
894 
895 SIMDE__FUNCTION_ATTRIBUTES
896 simde__m128 simde_mm_cvt_si2ss(simde__m128 a, int32_t b)
897 {
898  simde__m128 r;
899 
900 #if defined(SIMDE_SSE_NATIVE)
901  r.n = _mm_cvt_si2ss(a.n, b);
902 #else
903  r.f32[0] = (simde_float32)b;
904  r.i32[1] = a.i32[1];
905  r.i32[2] = a.i32[2];
906  r.i32[3] = a.i32[3];
907 #endif
908 
909  return r;
910 }
911 
912 SIMDE__FUNCTION_ATTRIBUTES
913 int32_t simde_mm_cvt_ss2si(simde__m128 a)
914 {
915 #if defined(SIMDE_SSE_NATIVE)
916  return _mm_cvt_ss2si(a.n);
917 #else
918  return (int32_t)a.f32[0];
919 #endif
920 }
921 
922 SIMDE__FUNCTION_ATTRIBUTES
923 simde__m128 simde_mm_cvtpi16_ps(simde__m64 a)
924 {
925  simde__m128 r;
926 
927 #if defined(SIMDE_SSE_NATIVE)
928  r.n = _mm_cvtpi16_ps(a.n);
929 #else
931  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
932  r.f32[i] = (simde_float32)a.i16[i];
933  }
934 #endif
935 
936  return r;
937 }
938 
939 SIMDE__FUNCTION_ATTRIBUTES
940 simde__m128 simde_mm_cvtpi32_ps(simde__m128 a, simde__m64 b)
941 {
942  simde__m128 r;
943 
944 #if defined(SIMDE_SSE_NATIVE)
945  r.n = _mm_cvtpi32_ps(a.n, b.n);
946 #else
947  r.f32[0] = (simde_float32)b.i32[0];
948  r.f32[1] = (simde_float32)b.i32[1];
949  r.i32[2] = a.i32[2];
950  r.i32[3] = a.i32[3];
951 #endif
952 
953  return r;
954 }
955 
956 SIMDE__FUNCTION_ATTRIBUTES
957 simde__m128 simde_mm_cvtpi32x2_ps(simde__m64 a, simde__m64 b)
958 {
959  simde__m128 r;
960 
961 #if defined(SIMDE_SSE_NATIVE)
962  r.n = _mm_cvtpi32x2_ps(a.n, b.n);
963 #else
964  r.f32[0] = (simde_float32)a.i32[0];
965  r.f32[1] = (simde_float32)a.i32[1];
966  r.f32[2] = (simde_float32)b.i32[0];
967  r.f32[3] = (simde_float32)b.i32[1];
968 #endif
969 
970  return r;
971 }
972 
973 SIMDE__FUNCTION_ATTRIBUTES
974 simde__m128 simde_mm_cvtpi8_ps(simde__m64 a)
975 {
976  simde__m128 r;
977 
978 #if defined(SIMDE_SSE_NATIVE)
979  r.n = _mm_cvtpi8_ps(a.n);
980 #else
981  r.f32[0] = (simde_float32)a.i8[0];
982  r.f32[1] = (simde_float32)a.i8[1];
983  r.f32[2] = (simde_float32)a.i8[2];
984  r.f32[3] = (simde_float32)a.i8[3];
985 #endif
986 
987  return r;
988 }
989 
990 SIMDE__FUNCTION_ATTRIBUTES
991 simde__m64 simde_mm_cvtps_pi16(simde__m128 a)
992 {
993  simde__m64 r;
994 
995 #if defined(SIMDE_SSE_NATIVE)
996  r.n = _mm_cvtps_pi16(a.n);
997 #else
999  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
1000  r.i16[i] = (int16_t)a.f32[i];
1001  }
1002 #endif
1003 
1004  return r;
1005 }
1006 
1007 SIMDE__FUNCTION_ATTRIBUTES
1008 simde__m64 simde_mm_cvtps_pi32(simde__m128 a)
1009 {
1010  simde__m64 r;
1011 
1012 #if defined(SIMDE_SSE_NATIVE)
1013  r.n = _mm_cvtps_pi32(a.n);
1014 #else
1016  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1017  r.i32[i] = (int32_t)a.f32[i];
1018  }
1019 #endif
1020 
1021  return r;
1022 }
1023 
1024 SIMDE__FUNCTION_ATTRIBUTES
1025 simde__m64 simde_mm_cvtps_pi8(simde__m128 a)
1026 {
1027  simde__m64 r;
1028 
1029 #if defined(SIMDE_SSE_NATIVE)
1030  r.n = _mm_cvtps_pi8(a.n);
1031 #else
1033  for (size_t i = 0; i < (sizeof(a.f32) / sizeof(a.f32[0])); i++) {
1034  r.i8[i] = (int8_t)a.f32[i];
1035  }
1036 #endif
1037 
1038  return r;
1039 }
1040 
1041 SIMDE__FUNCTION_ATTRIBUTES
1042 simde__m128 simde_mm_cvtpu16_ps(simde__m64 a)
1043 {
1044  simde__m128 r;
1045 
1046 #if defined(SIMDE_SSE_NATIVE)
1047  r.n = _mm_cvtpu16_ps(a.n);
1048 #else
1050  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1051  r.f32[i] = (simde_float32)a.u16[i];
1052  }
1053 #endif
1054 
1055  return r;
1056 }
1057 
1058 SIMDE__FUNCTION_ATTRIBUTES
1059 simde__m128 simde_mm_cvtpu8_ps(simde__m64 a)
1060 {
1061  simde__m128 r;
1062 
1063 #if defined(SIMDE_SSE_NATIVE)
1064  r.n = _mm_cvtpu8_ps(a.n);
1065 #else
1067  for (size_t i = 0; i < 4; i++) {
1068  r.f32[i] = (simde_float32)a.u8[i];
1069  }
1070 #endif
1071 
1072  return r;
1073 }
1074 
1075 SIMDE__FUNCTION_ATTRIBUTES
1076 simde__m128 simde_mm_cvtsi32_ss(simde__m128 a, int32_t b)
1077 {
1078  simde__m128 r;
1079 
1080 #if defined(SIMDE_SSE_NATIVE)
1081  r.n = _mm_cvtsi32_ss(a.n, b);
1082 #else
1083  r.f32[0] = (simde_float32)b;
1085  for (size_t i = 1; i < 4; i++) {
1086  r.i32[i] = a.i32[i];
1087  }
1088 #endif
1089 
1090  return r;
1091 }
1092 
1093 SIMDE__FUNCTION_ATTRIBUTES
1094 simde__m128 simde_mm_cvtsi64_ss(simde__m128 a, int64_t b)
1095 {
1096  simde__m128 r;
1097 
1098 #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1099 #if !defined(__PGI)
1100  r.n = _mm_cvtsi64_ss(a.n, b);
1101 #else
1102  r.n = _mm_cvtsi64x_ss(a.n, b);
1103 #endif
1104 #else
1105  r.f32[0] = (simde_float32)b;
1107  for (size_t i = 1; i < 4; i++) {
1108  r.i32[i] = a.i32[i];
1109  }
1110 #endif
1111 
1112  return r;
1113 }
1114 
1115 SIMDE__FUNCTION_ATTRIBUTES
1116 simde_float32 simde_mm_cvtss_f32(simde__m128 a)
1117 {
1118 #if defined(SIMDE_SSE_NATIVE)
1119  return _mm_cvtss_f32(a.n);
1120 #elif defined(SIMDE_SSE_NEON)
1121  return vgetq_lane_f32(a.neon_f32, 0);
1122 #else
1123  return a.f32[0];
1124 #endif
1125 }
1126 
1127 SIMDE__FUNCTION_ATTRIBUTES
1128 int32_t simde_mm_cvtss_si32(simde__m128 a)
1129 {
1130 #if defined(SIMDE_SSE_NATIVE)
1131  return _mm_cvtss_si32(a.n);
1132 #else
1133  return (int32_t)a.f32[0];
1134 #endif
1135 }
1136 
1137 SIMDE__FUNCTION_ATTRIBUTES
1138 int64_t simde_mm_cvtss_si64(simde__m128 a)
1139 {
1140 #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1141 #if !defined(__PGI)
1142  return _mm_cvtss_si64(a.n);
1143 #else
1144  return _mm_cvtss_si64x(a.n);
1145 #endif
1146 #else
1147  return (int64_t)a.f32[0];
1148 #endif
1149 }
1150 
1151 SIMDE__FUNCTION_ATTRIBUTES
1152 simde__m64 simde_mm_cvtt_ps2pi(simde__m128 a)
1153 {
1154  simde__m64 r;
1155 
1156 #if defined(SIMDE_SSE_NATIVE)
1157  r.n = _mm_cvtt_ps2pi(a.n);
1158 #else
1160  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1161  r.i32[i] = (int32_t)truncf(a.f32[i]);
1162  }
1163 #endif
1164 
1165  return r;
1166 }
1167 
1168 SIMDE__FUNCTION_ATTRIBUTES
1169 int32_t simde_mm_cvtt_ss2si(simde__m128 a)
1170 {
1171 #if defined(SIMDE_SSE_NATIVE)
1172  return _mm_cvtt_ss2si(a.n);
1173 #else
1174  return (int32_t)truncf(a.f32[0]);
1175 #endif
1176 }
1177 
1178 SIMDE__FUNCTION_ATTRIBUTES
1179 simde__m64 simde_mm_cvttps_pi32(simde__m128 a)
1180 {
1181  simde__m64 r;
1182 
1183 #if defined(SIMDE_SSE_NATIVE)
1184  r.n = _mm_cvttps_pi32(a.n);
1185 #else
1186  r = simde_mm_cvtt_ps2pi(a);
1187 #endif
1188 
1189  return r;
1190 }
1191 
1192 SIMDE__FUNCTION_ATTRIBUTES
1193 int32_t simde_mm_cvttss_si32(simde__m128 a)
1194 {
1195 #if defined(SIMDE_SSE_NATIVE)
1196  return _mm_cvttss_si32(a.n);
1197 #else
1198  return (int32_t)truncf(a.f32[0]);
1199 #endif
1200 }
1201 
1202 SIMDE__FUNCTION_ATTRIBUTES
1203 int64_t simde_mm_cvttss_si64(simde__m128 a)
1204 {
1205 #if defined(SIMDE_SSE_NATIVE) && defined(SIMDE_ARCH_AMD64)
1206 #if defined(__PGI)
1207  return _mm_cvttss_si64x(a.n);
1208 #else
1209  return _mm_cvttss_si64(a.n);
1210 #endif
1211 #else
1212  return (int64_t)truncf(a.f32[0]);
1213 #endif
1214 }
1215 
1216 SIMDE__FUNCTION_ATTRIBUTES
1217 simde__m128 simde_mm_div_ps(simde__m128 a, simde__m128 b)
1218 {
1219  simde__m128 r;
1220 
1221 #if defined(SIMDE_SSE_NATIVE)
1222  r.n = _mm_div_ps(a.n, b.n);
1223 #elif defined(SIMDE_SSE_NEON)
1224  float32x4_t recip0 = vrecpeq_f32(b.neon_f32);
1225  float32x4_t recip1 = vmulq_f32(recip0, vrecpsq_f32(recip0, b.neon_f32));
1226  r.neon_f32 = vmulq_f32(a.neon_f32, recip1);
1227 #else
1229  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1230  r.f32[i] = a.f32[i] / b.f32[i];
1231  }
1232 #endif
1233 
1234  return r;
1235 }
1236 
1237 SIMDE__FUNCTION_ATTRIBUTES
1238 simde__m128 simde_mm_div_ss(simde__m128 a, simde__m128 b)
1239 {
1240  simde__m128 r;
1241 
1242 #if defined(SIMDE_SSE_NATIVE)
1243  r.n = _mm_div_ss(a.n, b.n);
1244 #elif defined(SIMDE_SSE_NEON)
1245  float32_t value = vgetq_lane_f32(simde_mm_div_ps(a, b).neon_f32, 0);
1246  r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
1247 #else
1248  r.f32[0] = a.f32[0] / b.f32[0];
1250  for (size_t i = 1; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1251  r.f32[i] = a.f32[i];
1252  }
1253 #endif
1254 
1255  return r;
1256 }
1257 
1259 int32_t simde_mm_extract_pi16(simde__m64 a, const int imm8)
1260 {
1261  return a.u16[imm8];
1262 }
1263 #if defined(SIMDE_SSE_NATIVE)
1264 #define simde_mm_extract_pi16(a, imm8) _mm_extract_pi16(a.n, imm8)
1265 #endif
1266 #define simde_m_pextrw(a, imm8) simde_mm_extract_pi16(a, imm8)
1267 
1268 enum {
1269 #if defined(SIMDE_SSE_NATIVE)
1270  simde_MM_ROUND_NEAREST = _MM_ROUND_NEAREST,
1271  simde_MM_ROUND_DOWN = _MM_ROUND_DOWN,
1272  simde_MM_ROUND_UP = _MM_ROUND_UP,
1273  simde_MM_ROUND_TOWARD_ZERO = _MM_ROUND_TOWARD_ZERO
1274 #else
1275  simde_MM_ROUND_NEAREST
1276 #if defined(FE_TONEAREST)
1277  = FE_TONEAREST
1278 #endif
1279  ,
1280 
1281  simde_MM_ROUND_DOWN
1282 #if defined(FE_DOWNWARD)
1283  = FE_DOWNWARD
1284 #endif
1285  ,
1286 
1287  simde_MM_ROUND_UP
1288 #if defined(FE_UPWARD)
1289  = FE_UPWARD
1290 #endif
1291  ,
1292 
1293  simde_MM_ROUND_TOWARD_ZERO
1294 #if defined(FE_TOWARDZERO)
1295  = FE_TOWARDZERO
1296 #endif
1297 #endif
1298 };
1299 
1301 unsigned int simde_MM_GET_ROUNDING_MODE(void)
1302 {
1303 #if defined(SIMDE_SSE_NATIVE)
1304  return _MM_GET_ROUNDING_MODE();
1305 #else
1306  return fegetround();
1307 #endif
1308 }
1309 
1311 void simde_MM_SET_ROUNDING_MODE(unsigned int a)
1312 {
1313 #if defined(SIMDE_SSE_NATIVE)
1314  _MM_SET_ROUNDING_MODE(a);
1315 #else
1316  fesetround((int)a);
1317 #endif
1318 }
1319 
1321 simde__m64 simde_mm_insert_pi16(simde__m64 a, int16_t i, const int imm8)
1322 {
1323  simde__m64 r;
1324  r.i64[0] = a.i64[0];
1325  r.i16[imm8] = i;
1326  return r;
1327 }
1328 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
1329 #define simde_mm_insert_pi16(a, i, imm8) \
1330  SIMDE__M64_C(_mm_insert_pi16((a).n, i, imm8))
1331 #endif
1332 #define simde_m_pinsrw(a, i, imm8) \
1333  simde_mm_insert_pi16((a), (i), (imm8))
1334 
1335 SIMDE__FUNCTION_ATTRIBUTES
1336 simde__m128
1337 simde_mm_load_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
1338 {
1339  simde__m128 r;
1340 
1341  simde_assert_aligned(16, mem_addr);
1342 
1343 #if defined(SIMDE_SSE_NATIVE)
1344  r.n = _mm_load_ps(mem_addr);
1345 #elif defined(SIMDE_SSE_NEON)
1346  r.neon_f32 = vld1q_f32(mem_addr);
1347 #else
1348  memcpy(&r, mem_addr, sizeof(r.f32));
1349 #endif
1350 
1351  return r;
1352 }
1353 
1354 SIMDE__FUNCTION_ATTRIBUTES
1355 simde__m128 simde_mm_load_ps1(simde_float32 const *mem_addr)
1356 {
1357  simde__m128 r;
1358 
1359 #if defined(SIMDE_SSE_NATIVE)
1360  r.n = _mm_load_ps1(mem_addr);
1361 #else
1362  const simde_float32 v = *mem_addr;
1364  for (size_t i = 0; i < (sizeof(r.i32) / sizeof(r.i32[0])); i++) {
1365  r.f32[i] = v;
1366  }
1367 #endif
1368 
1369  return r;
1370 }
1371 
1372 SIMDE__FUNCTION_ATTRIBUTES
1373 simde__m128 simde_mm_load_ss(simde_float32 const *mem_addr)
1374 {
1375  simde__m128 r;
1376 
1377 #if defined(SIMDE_SSE_NATIVE)
1378  r.n = _mm_load_ss(mem_addr);
1379 #elif defined(SIMDE_SSE_NEON)
1380  r.neon_f32 = vsetq_lane_f32(*mem_addr, vdupq_n_f32(0), 0);
1381 #else
1382  r.f32[0] = *mem_addr;
1383  r.i32[1] = 0;
1384  r.i32[2] = 0;
1385  r.i32[3] = 0;
1386 #endif
1387 
1388  return r;
1389 }
1390 
1391 SIMDE__FUNCTION_ATTRIBUTES
1392 simde__m128 simde_mm_load1_ps(simde_float32 const *mem_addr)
1393 {
1394  simde__m128 r;
1395 
1396 #if defined(SIMDE_SSE_NATIVE)
1397  r.n = _mm_load1_ps(mem_addr);
1398 #elif defined(SIMDE_SSE_NEON)
1399  r.neon_f32 = vld1q_dup_f32(mem_addr);
1400 #else
1401  r = simde_mm_load_ps1(mem_addr);
1402 #endif
1403 
1404  return r;
1405 }
1406 
1407 SIMDE__FUNCTION_ATTRIBUTES
1408 simde__m128 simde_mm_loadh_pi(simde__m128 a, simde__m64 const *mem_addr)
1409 {
1410  simde__m128 r;
1411 
1412 #if defined(SIMDE_SSE_NATIVE)
1413  r.n = _mm_loadh_pi(a.n, (__m64 *)mem_addr);
1414 #else
1415  r.f32[0] = a.f32[0];
1416  r.f32[1] = a.f32[1];
1417  r.f32[2] = mem_addr->f32[0];
1418  r.f32[3] = mem_addr->f32[1];
1419 #endif
1420 
1421  return r;
1422 }
1423 
1424 SIMDE__FUNCTION_ATTRIBUTES
1425 simde__m128 simde_mm_loadl_pi(simde__m128 a, simde__m64 const *mem_addr)
1426 {
1427  simde__m128 r;
1428 
1429 #if defined(SIMDE_SSE_NATIVE)
1430  r.n = _mm_loadl_pi(a.n, (__m64 *)mem_addr);
1431 #else
1432  r.f32[0] = mem_addr->f32[0];
1433  r.f32[1] = mem_addr->f32[1];
1434  r.f32[2] = a.f32[2];
1435  r.f32[3] = a.f32[3];
1436 #endif
1437 
1438  return r;
1439 }
1440 
1441 SIMDE__FUNCTION_ATTRIBUTES
1442 simde__m128
1443 simde_mm_loadr_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
1444 {
1445  simde__m128 r;
1446 
1447  simde_assert_aligned(16, mem_addr);
1448 
1449 #if defined(SIMDE_SSE_NATIVE)
1450  r.n = _mm_loadr_ps(mem_addr);
1451 #else
1452  r.f32[0] = mem_addr[3];
1453  r.f32[1] = mem_addr[2];
1454  r.f32[2] = mem_addr[1];
1455  r.f32[3] = mem_addr[0];
1456 #endif
1457 
1458  return r;
1459 }
1460 
1461 SIMDE__FUNCTION_ATTRIBUTES
1462 simde__m128
1463 simde_mm_loadu_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
1464 {
1465  simde__m128 r;
1466 
1467 #if defined(SIMDE_SSE_NATIVE)
1468  r.n = _mm_loadu_ps(mem_addr);
1469 #elif defined(SIMDE_SSE_NEON)
1470  r.neon_f32 = vld1q_f32(mem_addr);
1471 #else
1472  r.f32[0] = mem_addr[0];
1473  r.f32[1] = mem_addr[1];
1474  r.f32[2] = mem_addr[2];
1475  r.f32[3] = mem_addr[3];
1476 #endif
1477 
1478  return r;
1479 }
1480 
1482 void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, char *mem_addr)
1483 {
1484 #if defined(SIMDE_SSE_NATIVE)
1485  _mm_maskmove_si64(a.n, mask.n, mem_addr);
1486 #else
1488  for (size_t i = 0; i < (sizeof(a.i8) / sizeof(a.i8[0])); i++)
1489  if (mask.i8[i] < 0)
1490  mem_addr[i] = a.i8[i];
1491 #endif
1492 }
1493 #define simde_m_maskmovq(a, mask, mem_addr) \
1494  simde_mm_maskmove_si64(a, mask, mem_addr)
1495 
1496 SIMDE__FUNCTION_ATTRIBUTES
1497 simde__m64 simde_mm_max_pi16(simde__m64 a, simde__m64 b)
1498 {
1499  simde__m64 r;
1500 
1501 #if defined(SIMDE_SSE_NATIVE)
1502  r.n = _mm_max_pi16(a.n, b.n);
1503 #else
1505  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
1506  r.i16[i] = (a.i16[i] > b.i16[i]) ? a.i16[i] : b.i16[i];
1507  }
1508 #endif
1509 
1510  return r;
1511 }
1512 #define simde_m_pmaxsw(a, b) simde_mm_max_pi16(a, b)
1513 
1514 SIMDE__FUNCTION_ATTRIBUTES
1515 simde__m128 simde_mm_max_ps(simde__m128 a, simde__m128 b)
1516 {
1517  simde__m128 r;
1518 
1519 #if defined(SIMDE_SSE_NATIVE)
1520  r.n = _mm_max_ps(a.n, b.n);
1521 #elif defined(SIMDE_SSE_NEON)
1522  r.neon_f32 = vmaxq_f32(a.neon_f32, b.neon_f32);
1523 #else
1525  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1526  r.f32[i] = (a.f32[i] > b.f32[i]) ? a.f32[i] : b.f32[i];
1527  }
1528 #endif
1529 
1530  return r;
1531 }
1532 
1533 SIMDE__FUNCTION_ATTRIBUTES
1534 simde__m64 simde_mm_max_pu8(simde__m64 a, simde__m64 b)
1535 {
1536  simde__m64 r;
1537 
1538 #if defined(SIMDE_SSE_NATIVE)
1539  r.n = _mm_max_pu8(a.n, b.n);
1540 #else
1542  for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
1543  r.u8[i] = (a.u8[i] > b.u8[i]) ? a.u8[i] : b.u8[i];
1544  }
1545 #endif
1546 
1547  return r;
1548 }
1549 #define simde_m_pmaxub(a, b) simde_mm_max_pu8(a, b)
1550 
1551 SIMDE__FUNCTION_ATTRIBUTES
1552 simde__m128 simde_mm_max_ss(simde__m128 a, simde__m128 b)
1553 {
1554  simde__m128 r;
1555 
1556 #if defined(SIMDE_SSE_NATIVE)
1557  r.n = _mm_max_ss(a.n, b.n);
1558 #elif defined(SIMDE_SSE_NEON)
1559  float32_t value = vgetq_lane_f32(vmaxq_f32(a.neon_f32, b.neon_f32), 0);
1560  r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
1561 #else
1562  r.f32[0] = (a.f32[0] > b.f32[0]) ? a.f32[0] : b.f32[0];
1563  r.f32[1] = a.f32[1];
1564  r.f32[2] = a.f32[2];
1565  r.f32[3] = a.f32[3];
1566 #endif
1567 
1568  return r;
1569 }
1570 
1571 SIMDE__FUNCTION_ATTRIBUTES
1572 simde__m64 simde_mm_min_pi16(simde__m64 a, simde__m64 b)
1573 {
1574  simde__m64 r;
1575 
1576 #if defined(SIMDE_SSE_NATIVE)
1577  r.n = _mm_min_pi16(a.n, b.n);
1578 #else
1580  for (size_t i = 0; i < (sizeof(r.i16) / sizeof(r.i16[0])); i++) {
1581  r.i16[i] = (a.i16[i] < b.i16[i]) ? a.i16[i] : b.i16[i];
1582  }
1583 #endif
1584 
1585  return r;
1586 }
1587 #define simde_m_pminsw(a, b) simde_mm_min_pi16(a, b)
1588 
1589 SIMDE__FUNCTION_ATTRIBUTES
1590 simde__m128 simde_mm_min_ps(simde__m128 a, simde__m128 b)
1591 {
1592  simde__m128 r;
1593 
1594 #if defined(SIMDE_SSE_NATIVE)
1595  r.n = _mm_min_ps(a.n, b.n);
1596 #elif defined(SIMDE_SSE_NEON)
1597  r.neon_f32 = vminq_f32(a.neon_f32, b.neon_f32);
1598 #else
1600  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1601  r.f32[i] = (a.f32[i] < b.f32[i]) ? a.f32[i] : b.f32[i];
1602  }
1603 #endif
1604 
1605  return r;
1606 }
1607 
1608 SIMDE__FUNCTION_ATTRIBUTES
1609 simde__m64 simde_mm_min_pu8(simde__m64 a, simde__m64 b)
1610 {
1611  simde__m64 r;
1612 
1613 #if defined(SIMDE_SSE_NATIVE)
1614  r.n = _mm_min_pu8(a.n, b.n);
1615 #else
1617  for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
1618  r.u8[i] = (a.u8[i] < b.u8[i]) ? a.u8[i] : b.u8[i];
1619  }
1620 #endif
1621 
1622  return r;
1623 }
1624 #define simde_m_pminub(a, b) simde_mm_min_pu8(a, b)
1625 
1626 SIMDE__FUNCTION_ATTRIBUTES
1627 simde__m128 simde_mm_min_ss(simde__m128 a, simde__m128 b)
1628 {
1629  simde__m128 r;
1630 
1631 #if defined(SIMDE_SSE_NATIVE)
1632  r.n = _mm_min_ss(a.n, b.n);
1633 #elif defined(SIMDE_SSE_NEON)
1634  float32_t value = vgetq_lane_f32(vminq_f32(a.neon_f32, b.neon_f32), 0);
1635  r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
1636 #else
1637  r.f32[0] = (a.f32[0] < b.f32[0]) ? a.f32[0] : b.f32[0];
1638  r.f32[1] = a.f32[1];
1639  r.f32[2] = a.f32[2];
1640  r.f32[3] = a.f32[3];
1641 #endif
1642 
1643  return r;
1644 }
1645 
1646 SIMDE__FUNCTION_ATTRIBUTES
1647 simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b)
1648 {
1649  simde__m128 r;
1650 
1651 #if defined(SIMDE_SSE_NATIVE)
1652  r.n = _mm_move_ss(a.n, b.n);
1653 #else
1654  r.f32[0] = b.f32[0];
1655  r.f32[1] = a.f32[1];
1656  r.f32[2] = a.f32[2];
1657  r.f32[3] = a.f32[3];
1658 #endif
1659 
1660  return r;
1661 }
1662 
1663 SIMDE__FUNCTION_ATTRIBUTES
1664 simde__m128 simde_mm_movehl_ps(simde__m128 a, simde__m128 b)
1665 {
1666  simde__m128 r;
1667 
1668 #if defined(SIMDE_SSE_NATIVE)
1669  r.n = _mm_movehl_ps(a.n, b.n);
1670 #else
1671  r.f32[0] = b.f32[2];
1672  r.f32[1] = b.f32[3];
1673  r.f32[2] = a.f32[2];
1674  r.f32[3] = a.f32[3];
1675 #endif
1676 
1677  return r;
1678 }
1679 
1680 SIMDE__FUNCTION_ATTRIBUTES
1681 simde__m128 simde_mm_movelh_ps(simde__m128 a, simde__m128 b)
1682 {
1683  simde__m128 r;
1684 
1685 #if defined(SIMDE_SSE_NATIVE)
1686  r.n = _mm_movelh_ps(a.n, b.n);
1687 #else
1688  r.f32[0] = a.f32[0];
1689  r.f32[1] = a.f32[1];
1690  r.f32[2] = b.f32[0];
1691  r.f32[3] = b.f32[1];
1692 #endif
1693 
1694  return r;
1695 }
1696 
1697 SIMDE__FUNCTION_ATTRIBUTES
1698 int simde_mm_movemask_pi8(simde__m64 a)
1699 {
1700 #if defined(SIMDE_SSE_NATIVE)
1701  return _mm_movemask_pi8(a.n);
1702 #else
1703  int r = 0;
1704  const size_t nmemb = sizeof(a.i8) / sizeof(a.i8[0]);
1705 
1707  for (size_t i = 0; i < nmemb; i++) {
1708  r |= (a.u8[nmemb - 1 - i] >> 7) << (nmemb - 1 - i);
1709  }
1710 
1711  return r;
1712 #endif
1713 }
1714 #define simde_m_pmovmskb(a) simde_mm_movemask_pi8(a)
1715 
1716 SIMDE__FUNCTION_ATTRIBUTES
1717 int simde_mm_movemask_ps(simde__m128 a)
1718 {
1719 #if defined(SIMDE_SSE_NATIVE)
1720  return _mm_movemask_ps(a.n);
1721 #elif defined(SIMDE_SSE_NEON)
1722  /* TODO: check to see if NEON version is faster than the portable version */
1723  static const uint32x4_t movemask = {1, 2, 4, 8};
1724  static const uint32x4_t highbit = {0x80000000, 0x80000000, 0x80000000,
1725  0x80000000};
1726  uint32x4_t t0 = a.neon_u32;
1727  uint32x4_t t1 = vtstq_u32(t0, highbit);
1728  uint32x4_t t2 = vandq_u32(t1, movemask);
1729  uint32x2_t t3 = vorr_u32(vget_low_u32(t2), vget_high_u32(t2));
1730  return vget_lane_u32(t3, 0) | vget_lane_u32(t3, 1);
1731 #else
1732  int r = 0;
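 /* Portable fallback: gather the sign bit of each 32-bit lane into bit i
  * of the result, mirroring what _mm_movemask_ps returns. */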
1733 
1735  for (size_t i = 0; i < sizeof(a.u32) / sizeof(a.u32[0]); i++) {
1736  r |= (a.u32[i] >> ((sizeof(a.u32[i]) * CHAR_BIT) - 1)) << i;
1737  }
1738 
1739  return r;
1740 #endif
1741 }
1742 
1743 SIMDE__FUNCTION_ATTRIBUTES
1744 simde__m128 simde_mm_mul_ps(simde__m128 a, simde__m128 b)
1745 {
1746  simde__m128 r;
1747 
1748 #if defined(SIMDE_SSE_NATIVE)
1749  r.n = _mm_mul_ps(a.n, b.n);
1750 #elif defined(SIMDE_SSE_NEON)
1751  r.neon_f32 = vmulq_f32(a.neon_f32, b.neon_f32);
1752 #else
1754  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1755  r.f32[i] = a.f32[i] * b.f32[i];
1756  }
1757 #endif
1758 
1759  return r;
1760 }
1761 
1762 SIMDE__FUNCTION_ATTRIBUTES
1763 simde__m128 simde_mm_mul_ss(simde__m128 a, simde__m128 b)
1764 {
1765  simde__m128 r;
1766 
1767 #if defined(SIMDE_SSE_NATIVE)
1768  r.n = _mm_mul_ss(a.n, b.n);
1769 #else
1770  r.f32[0] = a.f32[0] * b.f32[0];
1771  r.f32[1] = a.f32[1];
1772  r.f32[2] = a.f32[2];
1773  r.f32[3] = a.f32[3];
1774 #endif
1775 
1776  return r;
1777 }
1778 
1779 SIMDE__FUNCTION_ATTRIBUTES
1780 simde__m64 simde_mm_mulhi_pu16(simde__m64 a, simde__m64 b)
1781 {
1782  simde__m64 r;
1783 
1784 #if defined(SIMDE_SSE_NATIVE)
1785  r.n = _mm_mulhi_pu16(a.n, b.n);
1786 #else
1788  for (size_t i = 0; i < (sizeof(r.u16) / sizeof(r.u16[0])); i++) {
1789  r.u16[i] = (a.u16[i] * b.u16[i]) >> 16;
1790  }
1791 #endif
1792 
1793  return r;
1794 }
1795 #define simde_m_pmulhuw(a, b) simde_mm_mulhi_pu16(a, b)
1796 
1797 SIMDE__FUNCTION_ATTRIBUTES
1798 simde__m128 simde_mm_or_ps(simde__m128 a, simde__m128 b)
1799 {
1800  simde__m128 r;
1801 
1802 #if defined(SIMDE_SSE_NATIVE)
1803  r.n = _mm_or_ps(a.n, b.n);
1804 #elif defined(SIMDE_SSE_NEON)
1805  r.neon_i32 = vorrq_s32(a.neon_i32, b.neon_i32);
1806 #else
1808  for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
1809  r.u32[i] = a.u32[i] | b.u32[i];
1810  }
1811 #endif
1812 
1813  return r;
1814 }
1815 
1817 void simde_mm_prefetch(char const *p, int i)
1818 {
1819  (void)p;
1820  (void)i;
1821 }
1822 #if defined(SIMDE_SSE_NATIVE)
1823 #define simde_mm_prefetch(p, i) _mm_prefetch(p, i)
1824 #endif
1825 
1826 SIMDE__FUNCTION_ATTRIBUTES
1827 simde__m128 simde_mm_rcp_ps(simde__m128 a)
1828 {
1829  simde__m128 r;
1830 
1831 #if defined(SIMDE_SSE_NATIVE)
1832  r.n = _mm_rcp_ps(a.n);
1833 #elif defined(SIMDE_SSE_NEON)
1834  float32x4_t recip = vrecpeq_f32(a.neon_f32);
1835 
1836 #if !defined(SIMDE_MM_RCP_PS_ITERS)
1837 #define SIMDE_MM_RCP_PS_ITERS SIMDE_ACCURACY_ITERS
1838 #endif
1839 
1840  for (int i = 0; i < SIMDE_MM_RCP_PS_ITERS; ++i) {
1841  recip = vmulq_f32(recip, vrecpsq_f32(recip, a.neon_f32));
1842  }
1843 
1844  r.neon_f32 = recip;
1845 #else
1847  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1848  r.f32[i] = 1.0f / a.f32[i];
1849  }
1850 #endif
1851 
1852  return r;
1853 }
1854 
1855 SIMDE__FUNCTION_ATTRIBUTES
1856 simde__m128 simde_mm_rcp_ss(simde__m128 a)
1857 {
1858  simde__m128 r;
1859 
1860 #if defined(SIMDE_SSE_NATIVE)
1861  r.n = _mm_rcp_ss(a.n);
1862 #else
1863  r.f32[0] = 1.0f / a.f32[0];
1864  r.f32[1] = a.f32[1];
1865  r.f32[2] = a.f32[2];
1866  r.f32[3] = a.f32[3];
1867 #endif
1868 
1869  return r;
1870 }
1871 
1872 SIMDE__FUNCTION_ATTRIBUTES
1873 simde__m128 simde_mm_rsqrt_ps(simde__m128 a)
1874 {
1875  simde__m128 r;
1876 
1877 #if defined(SIMDE_SSE_NATIVE)
1878  r.n = _mm_rsqrt_ps(a.n);
1879 #elif defined(SIMDE_SSE_NEON)
1880  r.neon_f32 = vrsqrteq_f32(a.neon_f32);
1881 #elif defined(__STDC_IEC_559__)
1882  /* http://h14s.p5r.org/2012/09/0x5f3759df.html?mwh=1 */
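 /* Classic "fast inverse square root" bit trick: the magic constant below
  * gives a rough 1/sqrt(x) estimate, and the optional loop applies
  * Newton-Raphson refinement steps to improve its accuracy. */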
1884  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1885  r.i32[i] = INT32_C(0x5f3759df) - (a.i32[i] >> 1);
1886 
1887 #if SIMDE_ACCURACY_ITERS > 2
1888  const float half = SIMDE_FLOAT32_C(0.5) * a.f32[i];
1889  for (int ai = 2; ai < SIMDE_ACCURACY_ITERS; ai++)
1890  r.f32[i] *= SIMDE_FLOAT32_C(1.5) -
1891  (half * r.f32[i] * r.f32[i]);
1892 #endif
1893  }
1894 #else
1896  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
1897  r.f32[i] = 1.0f / sqrtf(a.f32[i]);
1898  }
1899 #endif
1900 
1901  return r;
1902 }
1903 
1904 SIMDE__FUNCTION_ATTRIBUTES
1905 simde__m128 simde_mm_rsqrt_ss(simde__m128 a)
1906 {
1907  simde__m128 r;
1908 
1909 #if defined(SIMDE_SSE_NATIVE)
1910  r.n = _mm_rsqrt_ss(a.n);
1911 #elif defined(__STDC_IEC_559__)
1912  {
1913  r.i32[0] = INT32_C(0x5f3759df) - (a.i32[0] >> 1);
1914 
1915 #if SIMDE_ACCURACY_ITERS > 2
1916  float half = SIMDE_FLOAT32_C(0.5) * a.f32[0];
1917  for (int ai = 2; ai < SIMDE_ACCURACY_ITERS; ai++)
1918  r.f32[0] *= SIMDE_FLOAT32_C(1.5) -
1919  (half * r.f32[0] * r.f32[0]);
1920 #endif
1921  }
1922  r.f32[0] = 1.0f / sqrtf(a.f32[0]);
1923  r.f32[1] = a.f32[1];
1924  r.f32[2] = a.f32[2];
1925  r.f32[3] = a.f32[3];
1926 #else
1927  r.f32[0] = 1.0f / sqrtf(a.f32[0]);
1928  r.f32[1] = a.f32[1];
1929  r.f32[2] = a.f32[2];
1930  r.f32[3] = a.f32[3];
1931 #endif
1932 
1933  return r;
1934 }
1935 
1936 SIMDE__FUNCTION_ATTRIBUTES
1937 simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b)
1938 {
1939  simde__m64 r;
1940 
1941 #if defined(SIMDE_SSE_NATIVE)
1942  r.n = _mm_sad_pu8(a.n, b.n);
1943 #else
1944  uint16_t sum = 0;
1945 
1947  for (size_t i = 0; i < (sizeof(r.u8) / sizeof(r.u8[0])); i++) {
1948  sum += (uint8_t)abs(a.u8[i] - b.u8[i]);
1949  }
1950 
1951  r.i16[0] = sum;
1952  r.i16[1] = 0;
1953  r.i16[2] = 0;
1954  r.i16[3] = 0;
1955 #endif
1956 
1957  return r;
1958 }
1959 #define simde_m_psadbw(a, b) simde_mm_sad_pu8(a, b)
1960 
1961 SIMDE__FUNCTION_ATTRIBUTES
1962 simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2,
1963  simde_float32 e1, simde_float32 e0)
1964 {
1965  simde__m128 r;
1966 
1967 #if defined(SIMDE_SSE_NATIVE)
1968  r.n = _mm_set_ps(e3, e2, e1, e0);
1969 #elif defined(SIMDE_SSE_NEON)
1970  SIMDE_ALIGN(16) simde_float32 data[4] = {e0, e1, e2, e3};
1971  r.neon_f32 = vld1q_f32(data);
1972 #else
1973  r.f32[0] = e0;
1974  r.f32[1] = e1;
1975  r.f32[2] = e2;
1976  r.f32[3] = e3;
1977 #endif
1978 
1979  return r;
1980 }
1981 
1982 SIMDE__FUNCTION_ATTRIBUTES
1983 simde__m128 simde_mm_set_ps1(simde_float32 a)
1984 {
1985  simde__m128 r;
1986 
1987 #if defined(SIMDE_SSE_NATIVE)
1988  r.n = _mm_set1_ps(a);
1989 #elif defined(SIMDE_SSE_NEON)
1990  r.neon_f32 = vdupq_n_f32(a);
1991 #else
1992  r = simde_mm_set_ps(a, a, a, a);
1993 #endif
1994 
1995  return r;
1996 }
1997 #define simde_mm_set1_ps(a) simde_mm_set_ps1(a)
1998 
1999 SIMDE__FUNCTION_ATTRIBUTES
2000 simde__m128 simde_mm_set_ss(simde_float32 a)
2001 {
2002  simde__m128 r;
2003 
2004 #if defined(SIMDE_SSE_NATIVE)
2005  r.n = _mm_set_ss(a);
2006 #else
2007  r = simde_mm_set_ps(0, 0, 0, a);
2008 #endif
2009 
2010  return r;
2011 }
2012 
2013 SIMDE__FUNCTION_ATTRIBUTES
2014 simde__m128 simde_mm_setr_ps(simde_float32 e3, simde_float32 e2,
2015  simde_float32 e1, simde_float32 e0)
2016 {
2017  simde__m128 r;
2018 
2019 #if defined(SIMDE_SSE_NATIVE)
2020  r.n = _mm_setr_ps(e3, e2, e1, e0);
2021 #elif defined(SIMDE_SSE_NEON)
2022  SIMDE_ALIGN(16) simde_float32 data[4] = {e3, e2, e1, e0};
2023  r.neon_f32 = vld1q_f32(data);
2024 #else
2025  r = simde_mm_set_ps(e0, e1, e2, e3);
2026 #endif
2027 
2028  return r;
2029 }
2030 
2031 SIMDE__FUNCTION_ATTRIBUTES
2032 simde__m128 simde_mm_setzero_ps(void)
2033 {
2034  simde__m128 r;
2035 
2036 #if defined(SIMDE_SSE_NATIVE)
2037  r.n = _mm_setzero_ps();
2038 #elif defined(SIMDE_SSE_NEON)
2039  r.neon_f32 = vdupq_n_f32(0.0f);
2040 #else
2041  r = simde_mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f);
2042 #endif
2043 
2044  return r;
2045 }
2046 
2047 SIMDE__FUNCTION_ATTRIBUTES
2048 void simde_mm_sfence(void)
2049 {
2050  /* TODO: Use Hedley. */
2051 #if defined(SIMDE_SSE_NATIVE)
2052  _mm_sfence();
2053 #elif defined(__GNUC__) && \
2054  ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
2055  __atomic_thread_fence(__ATOMIC_SEQ_CST);
2056 #elif !defined(__INTEL_COMPILER) && defined(__STDC_VERSION__) && \
2057  (__STDC_VERSION__ >= 201112L) && !defined(__STDC_NO_ATOMICS__)
2058 #if defined(__GNUC__) && (__GNUC__ == 4) && (__GNUC_MINOR__ < 9)
2059  __atomic_thread_fence(__ATOMIC_SEQ_CST);
2060 #else
2061  atomic_thread_fence(memory_order_seq_cst);
2062 #endif
2063 #elif defined(_MSC_VER)
2064  MemoryBarrier();
2065 #elif defined(__GNUC__) && \
2066  ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 7))
2067  __atomic_thread_fence(__ATOMIC_SEQ_CST);
2068 #elif HEDLEY_CLANG_HAS_FEATURE(c_atomic)
2069  __c11_atomic_thread_fence(__ATOMIC_SEQ_CST);
2070 #elif defined(__GNUC__) && \
2071  ((__GNUC__ > 4) || (__GNUC__ == 4 && __GNUC_MINOR__ >= 1))
2072  __sync_synchronize();
2073 #elif (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x5140)) || \
2074  (defined(__SUNPRO_CC) && (__SUNPRO_CC >= 0x5140))
2075  __atomic_thread_fence(__ATOMIC_SEQ_CST);
2076 #elif defined(_OPENMP)
2077 #pragma omp critical(simde_mm_sfence_)
2078  {
2079  }
2080 #endif
2081 }
2082 
2083 #define SIMDE_MM_SHUFFLE(z, y, x, w) \
2084  (((z) << 6) | ((y) << 4) | ((x) << 2) | (w))
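/* Packs four 2-bit lane selectors into a single 8-bit immediate, matching
 * _MM_SHUFFLE; e.g. SIMDE_MM_SHUFFLE(3, 2, 1, 0) == 0xE4 keeps the lanes in
 * their original order. */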
2085 
2086 SIMDE__FUNCTION_ATTRIBUTES
2087 simde__m64 simde_mm_shuffle_pi16(simde__m64 a, const int imm8)
2088 {
2089  simde__m64 r;
2090  for (size_t i = 0; i < sizeof(r.u16) / sizeof(r.u16[0]); i++) {
2091  r.i16[i] = a.i16[(imm8 >> (i * 2)) & 3];
2092  }
2093  return r;
2094 }
2095 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
2096 #define simde_mm_shuffle_pi16(a, imm8) SIMDE__M64_C(_mm_shuffle_pi16(a.n, imm8))
2097 #elif defined(SIMDE__SHUFFLE_VECTOR)
2098 #define simde_mm_shuffle_pi16(a, imm8) \
2099  ({ \
2100  const simde__m64 simde__tmp_a_ = a; \
2101  (simde__m64){.i16 = SIMDE__SHUFFLE_VECTOR( \
2102  16, 8, (simde__tmp_a_).i16, \
2103  (simde__tmp_a_).i16, (((imm8)) & 3), \
2104  (((imm8) >> 2) & 3), (((imm8) >> 4) & 3), \
2105  (((imm8) >> 6) & 3))}; \
2106  })
2107 #endif
2108 
2109 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
2110 #define simde_m_pshufw(a, imm8) SIMDE__M64_C(_m_pshufw(a.n, imm8))
2111 #else
2112 #define simde_m_pshufw(a, imm8) simde_mm_shuffle_pi16(a, imm8)
2113 #endif
2114 
2115 SIMDE__FUNCTION_ATTRIBUTES
2116 simde__m128 simde_mm_shuffle_ps(simde__m128 a, simde__m128 b, const int imm8)
2117 {
2118  simde__m128 r;
2119  r.f32[0] = a.f32[(imm8 >> 0) & 3];
2120  r.f32[1] = a.f32[(imm8 >> 2) & 3];
2121  r.f32[2] = b.f32[(imm8 >> 4) & 3];
2122  r.f32[3] = b.f32[(imm8 >> 6) & 3];
2123  return r;
2124 }
2125 #if defined(SIMDE_SSE_NATIVE) && !defined(__PGI)
2126 #define simde_mm_shuffle_ps(a, b, imm8) \
2127  SIMDE__M128_C(_mm_shuffle_ps(a.n, b.n, imm8))
2128 #elif defined(SIMDE__SHUFFLE_VECTOR)
2129 #define simde_mm_shuffle_ps(a, b, imm8) \
2130  ({ \
2131  (simde__m128){.f32 = SIMDE__SHUFFLE_VECTOR( \
2132  32, 16, (a).f32, (b).f32, \
2133  (((imm8)) & 3), (((imm8) >> 2) & 3), \
2134  (((imm8) >> 4) & 3) + 4, \
2135  (((imm8) >> 6) & 3) + 4)}; \
2136  })
2137 #endif
2138 
2139 SIMDE__FUNCTION_ATTRIBUTES
2140 simde__m128 simde_mm_sqrt_ps(simde__m128 a)
2141 {
2142  simde__m128 r;
2143 
2144 #if defined(SIMDE_SSE_NATIVE)
2145  r.n = _mm_sqrt_ps(a.n);
2146 #elif defined(SIMDE_SSE_NEON)
2147  float32x4_t recipsq = vrsqrteq_f32(a.neon_f32);
2148  float32x4_t sq = vrecpeq_f32(recipsq);
2149  /* ??? use step versions of both sqrt and recip for better accuracy? */
2150  r.neon_f32 = sq;
2151 #else
2153  for (size_t i = 0; i < sizeof(r.f32) / sizeof(r.f32[0]); i++) {
2154  r.f32[i] = sqrtf(a.f32[i]);
2155  }
2156 #endif
2157 
2158  return r;
2159 }
2160 
2161 SIMDE__FUNCTION_ATTRIBUTES
2162 simde__m128 simde_mm_sqrt_ss(simde__m128 a)
2163 {
2164  simde__m128 r;
2165 
2166 #if defined(SIMDE_SSE_NATIVE)
2167  r.n = _mm_sqrt_ss(a.n);
2168 #elif defined(SIMDE_SSE_NEON)
2169  float32_t value = vgetq_lane_f32(simde_mm_sqrt_ps(a).neon_f32, 0);
2170  r.neon_f32 = vsetq_lane_f32(value, a.neon_f32, 0);
2171 #else
2172  r.f32[0] = sqrtf(a.f32[0]);
2173  r.f32[1] = a.f32[1];
2174  r.f32[2] = a.f32[2];
2175  r.f32[3] = a.f32[3];
2176 #endif
2177 
2178  return r;
2179 }
2180 
2181 SIMDE__FUNCTION_ATTRIBUTES
2182 void simde_mm_store_ps(simde_float32 mem_addr[4], simde__m128 a)
2183 {
2184  simde_assert_aligned(16, mem_addr);
2185 
2186 #if defined(SIMDE_SSE_NATIVE)
2187  _mm_store_ps(mem_addr, a.n);
2188 #elif defined(SIMDE_SSE_NEON)
2189  vst1q_f32(mem_addr, a.neon_f32);
2190 #else
2191  SIMDE__VECTORIZE_ALIGNED(mem_addr : 16)
2192  for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
2193  mem_addr[i] = a.f32[i];
2194  }
2195 #endif
2196 }
2197 
2198 SIMDE__FUNCTION_ATTRIBUTES
2199 void simde_mm_store_ps1(simde_float32 mem_addr[4], simde__m128 a)
2200 {
2201  simde_assert_aligned(16, mem_addr);
2202 
2203 #if defined(SIMDE_SSE_NATIVE)
2204  _mm_store_ps1(mem_addr, a.n);
2205 #else
2206  SIMDE__VECTORIZE_ALIGNED(mem_addr : 16)
2207  for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
2208  mem_addr[i] = a.f32[0];
2209  }
2210 #endif
2211 }
2212 
2213 SIMDE__FUNCTION_ATTRIBUTES
2214 void simde_mm_store_ss(simde_float32 *mem_addr, simde__m128 a)
2215 {
2216 #if defined(SIMDE_SSE_NATIVE)
2217  _mm_store_ss(mem_addr, a.n);
2218 #elif defined(SIMDE_SSE_NEON)
2219  vst1q_lane_f32(mem_addr, a.neon_f32, 0);
2220 #else
2221  *mem_addr = a.f32[0];
2222 #endif
2223 }
2224 
2225 SIMDE__FUNCTION_ATTRIBUTES
2226 void simde_mm_store1_ps(simde_float32 mem_addr[4], simde__m128 a)
2227 {
2228  simde_assert_aligned(16, mem_addr);
2229 
2230 #if defined(SIMDE_SSE_NATIVE)
2231  _mm_store1_ps(mem_addr, a.n);
2232 #else
2233  simde_mm_store_ps1(mem_addr, a);
2234 #endif
2235 }
2236 
2237 SIMDE__FUNCTION_ATTRIBUTES
2238 void simde_mm_storeh_pi(simde__m64 *mem_addr, simde__m128 a)
2239 {
2240 #if defined(SIMDE_SSE_NATIVE)
2241  _mm_storeh_pi(&(mem_addr->n), a.n);
2242 #else
2243  mem_addr->f32[0] = a.f32[2];
2244  mem_addr->f32[1] = a.f32[3];
2245 #endif
2246 }
2247 
2248 SIMDE__FUNCTION_ATTRIBUTES
2249 void simde_mm_storel_pi(simde__m64 *mem_addr, simde__m128 a)
2250 {
2251 #if defined(SIMDE_SSE_NATIVE)
2252  _mm_storel_pi(&(mem_addr->n), a.n);
2253 #else
2254  mem_addr->f32[0] = a.f32[0];
2255  mem_addr->f32[1] = a.f32[1];
2256 #endif
2257 }
2258 
2259 SIMDE__FUNCTION_ATTRIBUTES
2260 void simde_mm_storer_ps(simde_float32 mem_addr[4], simde__m128 a)
2261 {
2262  simde_assert_aligned(16, mem_addr);
2263 
2264 #if defined(SIMDE_SSE_NATIVE)
2265  _mm_storer_ps(mem_addr, a.n);
2266 #else
2267  SIMDE__VECTORIZE_ALIGNED(mem_addr : 16)
2268  for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
2269  mem_addr[i] =
2270  a.f32[((sizeof(a.f32) / sizeof(a.f32[0])) - 1) - i];
2271  }
2272 #endif
2273 }
2274 
2275 SIMDE__FUNCTION_ATTRIBUTES
2276 void simde_mm_storeu_ps(simde_float32 mem_addr[4], simde__m128 a)
2277 {
2278 #if defined(SIMDE_SSE_NATIVE)
2279  _mm_storeu_ps(mem_addr, a.n);
2280 #elif defined(SIMDE_SSE_NEON)
2281  vst1q_f32(mem_addr, a.neon_f32);
2282 #else
2284  for (size_t i = 0; i < sizeof(a.f32) / sizeof(a.f32[0]); i++) {
2285  mem_addr[i] = a.f32[i];
2286  }
2287 #endif
2288 }
2289 
2290 SIMDE__FUNCTION_ATTRIBUTES
2291 simde__m128 simde_mm_sub_ps(simde__m128 a, simde__m128 b)
2292 {
2293  simde__m128 r;
2294 
2295 #if defined(SIMDE_SSE_NATIVE)
2296  r.n = _mm_sub_ps(a.n, b.n);
2297 #elif defined(SIMDE_SSE_NEON)
2298  r.neon_f32 = vsubq_f32(a.neon_f32, b.neon_f32);
2299 #else
2301  for (size_t i = 0; i < (sizeof(r.f32) / sizeof(r.f32[0])); i++) {
2302  r.f32[i] = a.f32[i] - b.f32[i];
2303  }
2304 #endif
2305 
2306  return r;
2307 }
2308 
2309 SIMDE__FUNCTION_ATTRIBUTES
2310 simde__m128 simde_mm_sub_ss(simde__m128 a, simde__m128 b)
2311 {
2312  simde__m128 r;
2313 
2314 #if defined(SIMDE_SSE_NATIVE)
2315  r.n = _mm_sub_ss(a.n, b.n);
2316 #else
2317  r.f32[0] = a.f32[0] - b.f32[0];
2318  r.f32[1] = a.f32[1];
2319  r.f32[2] = a.f32[2];
2320  r.f32[3] = a.f32[3];
2321 #endif
2322 
2323  return r;
2324 }
2325 
2326 SIMDE__FUNCTION_ATTRIBUTES
2327 int simde_mm_ucomieq_ss(simde__m128 a, simde__m128 b)
2328 {
2329 #if defined(SIMDE_SSE_NATIVE)
2330  return _mm_ucomieq_ss(a.n, b.n);
2331 #else
2332  fenv_t envp;
2333  int x = feholdexcept(&envp);
2334  int r = a.f32[0] == b.f32[0];
2335  if (HEDLEY_LIKELY(x == 0))
2336  fesetenv(&envp);
2337  return r;
2338 #endif
2339 }
2340 
2341 SIMDE__FUNCTION_ATTRIBUTES
2342 int simde_mm_ucomige_ss(simde__m128 a, simde__m128 b)
2343 {
2344 #if defined(SIMDE_SSE_NATIVE)
2345  return _mm_ucomige_ss(a.n, b.n);
2346 #else
2347  fenv_t envp;
2348  int x = feholdexcept(&envp);
2349  int r = a.f32[0] >= b.f32[0];
2350  if (HEDLEY_LIKELY(x == 0))
2351  fesetenv(&envp);
2352  return r;
2353 #endif
2354 }
2355 
2356 SIMDE__FUNCTION_ATTRIBUTES
2357 int simde_mm_ucomigt_ss(simde__m128 a, simde__m128 b)
2358 {
2359 #if defined(SIMDE_SSE_NATIVE)
2360  return _mm_ucomigt_ss(a.n, b.n);
2361 #else
2362  fenv_t envp;
2363  int x = feholdexcept(&envp);
2364  int r = a.f32[0] > b.f32[0];
2365  if (HEDLEY_LIKELY(x == 0))
2366  fesetenv(&envp);
2367  return r;
2368 #endif
2369 }
2370 
2371 SIMDE__FUNCTION_ATTRIBUTES
2372 int simde_mm_ucomile_ss(simde__m128 a, simde__m128 b)
2373 {
2374 #if defined(SIMDE_SSE_NATIVE)
2375  return _mm_ucomile_ss(a.n, b.n);
2376 #else
2377  fenv_t envp;
2378  int x = feholdexcept(&envp);
2379  int r = a.f32[0] <= b.f32[0];
2380  if (HEDLEY_LIKELY(x == 0))
2381  fesetenv(&envp);
2382  return r;
2383 #endif
2384 }
2385 
2386 SIMDE__FUNCTION_ATTRIBUTES
2387 int simde_mm_ucomilt_ss(simde__m128 a, simde__m128 b)
2388 {
2389 #if defined(SIMDE_SSE_NATIVE)
2390  return _mm_ucomilt_ss(a.n, b.n);
2391 #else
2392  fenv_t envp;
2393  int x = feholdexcept(&envp);
2394  int r = a.f32[0] < b.f32[0];
2395  if (HEDLEY_LIKELY(x == 0))
2396  fesetenv(&envp);
2397  return r;
2398 #endif
2399 }
2400 
2401 SIMDE__FUNCTION_ATTRIBUTES
2402 int simde_mm_ucomineq_ss(simde__m128 a, simde__m128 b)
2403 {
2404 #if defined(SIMDE_SSE_NATIVE)
2405  return _mm_ucomineq_ss(a.n, b.n);
2406 #else
2407  fenv_t envp;
2408  int x = feholdexcept(&envp);
2409  int r = a.f32[0] != b.f32[0];
2410  if (HEDLEY_LIKELY(x == 0))
2411  fesetenv(&envp);
2412  return r;
2413 #endif
2414 }
2415 
2416 #if defined(SIMDE_SSE_NATIVE)
2417 #if defined(__has_builtin)
2418 #if __has_builtin(__builtin_ia32_undef128)
2419 #define SIMDE__HAVE_UNDEFINED128
2420 #endif
2421 #elif !defined(__PGI) && !defined(SIMDE_BUG_GCC_REV_208793)
2422 #define SIMDE__HAVE_UNDEFINED128
2423 #endif
2424 #endif
2425 
2426 SIMDE__FUNCTION_ATTRIBUTES
2427 simde__m128 simde_mm_undefined_ps(void)
2428 {
2429  simde__m128 r;
2430 
2431 #if defined(SIMDE__HAVE_UNDEFINED128)
2432  r.n = _mm_undefined_ps();
2433 #else
2434  r = simde_mm_setzero_ps();
2435 #endif
2436 
2437  return r;
2438 }
2439 
2440 SIMDE__FUNCTION_ATTRIBUTES
2441 simde__m128 simde_mm_unpackhi_ps(simde__m128 a, simde__m128 b)
2442 {
2443  simde__m128 r;
2444 
2445 #if defined(SIMDE_SSE_NATIVE)
2446  r.n = _mm_unpackhi_ps(a.n, b.n);
2447 #elif defined(SIMDE_SSE_NEON)
2448  float32x2_t a1 = vget_high_f32(a.neon_f32);
2449  float32x2_t b1 = vget_high_f32(b.neon_f32);
2450  float32x2x2_t result = vzip_f32(a1, b1);
2451  r.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
2452 #else
2453  r.f32[0] = a.f32[2];
2454  r.f32[1] = b.f32[2];
2455  r.f32[2] = a.f32[3];
2456  r.f32[3] = b.f32[3];
2457 #endif
2458 
2459  return r;
2460 }
2461 
2462 SIMDE__FUNCTION_ATTRIBUTES
2463 simde__m128 simde_mm_unpacklo_ps(simde__m128 a, simde__m128 b)
2464 {
2465  simde__m128 r;
2466 
2467 #if defined(SIMDE_SSE_NATIVE)
2468  r.n = _mm_unpacklo_ps(a.n, b.n);
2469 #elif defined(SIMDE_SSE_NEON)
2470  float32x2_t a1 = vget_low_f32(a.neon_f32);
2471  float32x2_t b1 = vget_low_f32(b.neon_f32);
2472  float32x2x2_t result = vzip_f32(a1, b1);
2473  r.neon_f32 = vcombine_f32(result.val[0], result.val[1]);
2474 #else
2475  r.f32[0] = a.f32[0];
2476  r.f32[1] = b.f32[0];
2477  r.f32[2] = a.f32[1];
2478  r.f32[3] = b.f32[1];
2479 #endif
2480 
2481  return r;
2482 }
2483 
2484 SIMDE__FUNCTION_ATTRIBUTES
2485 simde__m128 simde_mm_xor_ps(simde__m128 a, simde__m128 b)
2486 {
2487  simde__m128 r;
2488 
2489 #if defined(SIMDE_SSE_NATIVE)
2490  r.n = _mm_xor_ps(a.n, b.n);
2491 #elif defined(SIMDE_SSE_NEON)
2492  r.neon_i32 = veorq_s32(a.neon_i32, b.neon_i32);
2493 #else
2495  for (size_t i = 0; i < (sizeof(r.u32) / sizeof(r.u32[0])); i++) {
2496  r.u32[i] = a.u32[i] ^ b.u32[i];
2497  }
2498 #endif
2499 
2500  return r;
2501 }
2502 
2503 SIMDE__FUNCTION_ATTRIBUTES
2504 void simde_mm_stream_pi(simde__m64 *mem_addr, simde__m64 a)
2505 {
2506 #if defined(SIMDE_SSE_NATIVE)
2507  _mm_stream_pi(&(mem_addr->n), a.n);
2508 #else
2509  mem_addr->i64[0] = a.i64[0];
2510 #endif
2511 }
2512 
2513 SIMDE__FUNCTION_ATTRIBUTES
2514 void simde_mm_stream_ps(simde_float32 mem_addr[4], simde__m128 a)
2515 {
2516  simde_assert_aligned(16, mem_addr);
2517 
2518 #if defined(SIMDE_SSE_NATIVE)
2519  _mm_stream_ps(mem_addr, a.n);
2520 #else
2521  SIMDE__ASSUME_ALIGNED(mem_addr, 16);
2522  memcpy(mem_addr, &a, sizeof(a));
2523 #endif
2524 }
2525 
2527 uint32_t simde_mm_getcsr(void)
2528 {
2529 #if defined(SIMDE_SSE_NATIVE)
2530  return _mm_getcsr();
2531 #else
2532  uint32_t r = 0;
2533  int rounding_mode = fegetround();
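 /* Only the MXCSR rounding-control field (bits 13-14) is emulated here:
  * 00 = nearest, 01 = down, 10 = up, 11 = toward zero. */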
2534 
2535  switch (rounding_mode) {
2536  case FE_TONEAREST:
2537  break;
2538  case FE_UPWARD:
2539  r |= 2 << 13;
2540  break;
2541  case FE_DOWNWARD:
2542  r |= 1 << 13;
2543  break;
2544  case FE_TOWARDZERO:
2545  r = 3 << 13;
2546  break;
2547  }
2548 
2549  return r;
2550 #endif
2551 }
2552 
2554 void simde_mm_setcsr(uint32_t a)
2555 {
2556 #if defined(SIMDE_SSE_NATIVE)
2557  _mm_setcsr(a);
2558 #else
2559  switch ((a >> 13) & 3) {
2560  case 0:
2561  fesetround(FE_TONEAREST);
2562  break;
2563  case 1:
2564  fesetround(FE_DOWNWARD);
2565  break;
2566  case 2:
2567  fesetround(FE_UPWARD);
2568  break;
2569  case 3:
2570  fesetround(FE_TOWARDZERO);
2571  break;
2572  }
2573 #endif
2574 }
2575 
2576 #define SIMDE_MM_TRANSPOSE4_PS(row0, row1, row2, row3) \
2577  do { \
2578  simde__m128 tmp3, tmp2, tmp1, tmp0; \
2579  tmp0 = simde_mm_unpacklo_ps((row0), (row1)); \
2580  tmp2 = simde_mm_unpacklo_ps((row2), (row3)); \
2581  tmp1 = simde_mm_unpackhi_ps((row0), (row1)); \
2582  tmp3 = simde_mm_unpackhi_ps((row2), (row3)); \
2583  row0 = simde_mm_movelh_ps(tmp0, tmp2); \
2584  row1 = simde_mm_movehl_ps(tmp2, tmp0); \
2585  row2 = simde_mm_movelh_ps(tmp1, tmp3); \
2586  row3 = simde_mm_movehl_ps(tmp3, tmp1); \
2587  } while (0)
2588 
2589 SIMDE__END_DECLS
2590 
2591 #endif /* !defined(SIMDE__SSE_H) */
Definition: sse.h:161
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpord_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:671
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvtt_ss2si(simde__m128 a)
Definition: sse.h:1169
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadu_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
Definition: sse.h:1463
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpngt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:587
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_setr_ps(simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0)
Definition: sse.h:2014
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi32_ps(simde__m128 a, simde__m64 b)
Definition: sse.h:940
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comilt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:828
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpgt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:373
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomieq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2327
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpge_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:328
Definition: half.h:49
simde_float32 f32[2]
Definition: mmx.h:74
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_rcp_ps(simde__m128 a)
Definition: sse.h:1827
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_mul_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1744
#define HEDLEY_LIKELY(expr)
Definition: hedley.h:1065
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_or_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1798
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storer_ps(simde_float32 mem_addr[4], simde__m128 a)
Definition: sse.h:2260
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpneq_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:489
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_rsqrt_ss(simde__m128 a)
Definition: sse.h:1905
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_prefetch(char const *p, int i)
Definition: sse.h:1817
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_maskmove_si64(simde__m64 a, simde__m64 mask, char *mem_addr)
Definition: sse.h:1482
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store1_ps(simde_float32 mem_addr[4], simde__m128 a)
Definition: sse.h:2226
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomile_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2372
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadr_ps(simde_float32 const mem_addr[HEDLEY_ARRAY_PARAM(4)])
Definition: sse.h:1443
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_min_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:1627
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_rcp_ss(simde__m128 a)
Definition: sse.h:1856
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_sfence(void)
Definition: sse.h:2048
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpneq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:508
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comineq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:844
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_min_pi16(simde__m64 a, simde__m64 b)
Definition: sse.h:1572
Definition: mmx.h:54
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi32x2_ps(simde__m64 a, simde__m64 b)
Definition: sse.h:957
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_move_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:1647
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_setzero_ps(void)
Definition: sse.h:2032
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtps_pi32(simde__m128 a)
Definition: sse.h:1008
SIMDE__FUNCTION_ATTRIBUTES unsigned int simde_MM_GET_ROUNDING_MODE(void)
Definition: sse.h:1301
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_set_ps1(simde_float32 a)
Definition: sse.h:1983
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_add_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:142
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_sad_pu8(simde__m64 a, simde__m64 b)
Definition: sse.h:1937
int32_t i32[2]
Definition: mmx.h:68
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_avg_pu16(simde__m64 a, simde__m64 b)
Definition: sse.h:224
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmple_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:418
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storel_pi(simde__m64 *mem_addr, simde__m128 a)
Definition: sse.h:2249
SIMDE__FUNCTION_ATTRIBUTES int64_t simde_mm_cvtss_si64(simde__m128 a)
Definition: sse.h:1138
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comile_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:812
#define SIMDE__BEGIN_DECLS
Definition: simde-common.h:130
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_stream_pi(simde__m64 *mem_addr, simde__m64 a)
Definition: sse.h:2504
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comige_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:778
#define SIMDE__FUNCTION_ATTRIBUTES
Definition: simde-common.h:121
#define SIMDE_FLOAT32_C(value)
Definition: simde-common.h:146
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_mulhi_pu16(simde__m64 a, simde__m64 b)
Definition: sse.h:1780
uint8_t u8[8]
Definition: mmx.h:70
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_max_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:1552
#define SIMDE__VECTORIZE_ALIGNED(a)
Definition: simde-common.h:101
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_movelh_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1681
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtps_pi16(simde__m128 a)
Definition: sse.h:991
SIMDE__FUNCTION_ATTRIBUTES int64_t simde_mm_cvttss_si64(simde__m128 a)
Definition: sse.h:1203
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpgt_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:354
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomige_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2342
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpord_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:696
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_movemask_ps(simde__m128 a)
Definition: sse.h:1717
#define SIMDE_ACCURACY_ITERS
Definition: simde-common.h:216
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_div_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:1238
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomineq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2402
#define SIMDE_ALIGN(alignment)
Definition: sse.h:77
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_min_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1590
#define simde_assert_aligned(alignment, val)
Definition: simde-common.h:50
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnle_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:606
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_sqrt_ps(simde__m128 a)
Definition: sse.h:2140
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_unpackhi_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:2441
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmplt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:463
SIMDE__FUNCTION_ATTRIBUTES simde_float32 simde_mm_cvtss_f32(simde__m128 a)
Definition: sse.h:1116
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comieq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:762
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_cvtt_ps2pi(simde__m128 a)
Definition: sse.h:1152
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_ss(simde_float32 *mem_addr, simde__m128 a)
Definition: sse.h:2214
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_div_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1217
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_sub_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2310
Definition: sse.h:1275
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_and_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:186
uint16_t u16[4]
Definition: mmx.h:71
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_storeu_ps(simde_float32 mem_addr[4], simde__m128 a)
Definition: sse.h:2276
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_load_ps1(simde_float32 const *mem_addr)
Definition: sse.h:1355
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnlt_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:641
HEDLEY_STATIC_ASSERT(16==sizeof(simde__m128), "simde__m128 size incorrect")
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvt_ss2si(simde__m128 a)
Definition: sse.h:913
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtpi8_ps(simde__m64 a)
Definition: sse.h:974
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmple_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:399
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvtsi32_ss(simde__m128 a, int32_t b)
Definition: sse.h:1076
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_mul_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:1763
SIMDE__FUNCTION_ATTRIBUTES int32_t simde_mm_cvtss_si32(simde__m128 a)
Definition: sse.h:1128
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpunord_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:741
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvt_si2ss(simde__m128 a, int32_t b)
Definition: sse.h:896
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cvt_pi2ps(simde__m128 a, simde__m64 b)
Definition: sse.h:862
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_set_ps(simde_float32 e3, simde_float32 e2, simde_float32 e1, simde_float32 e0)
Definition: sse.h:1962
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_max_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:1515
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnle_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:622
#define SIMDE__VECTORIZE
Definition: simde-common.h:98
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_insert_pi16(simde__m64 a, int16_t i, const int imm8)
Definition: sse.h:1321
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmplt_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:444
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_shuffle_ps(simde__m128 a, simde__m128 b, const int imm8)
Definition: sse.h:2116
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_setcsr(uint32_t a)
Definition: sse.h:2554
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_ucomilt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:2387
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_sqrt_ss(simde__m128 a)
Definition: sse.h:2162
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpnlt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:657
SIMDE__FUNCTION_ATTRIBUTES int simde_mm_comigt_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:795
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_avg_pu8(simde__m64 a, simde__m64 b)
Definition: sse.h:244
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_set_ss(simde_float32 a)
Definition: sse.h:2000
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_shuffle_pi16(simde__m64 a, const int imm8)
Definition: sse.h:2087
SIMDE__FUNCTION_ATTRIBUTES simde__m64 simde_mm_min_pu8(simde__m64 a, simde__m64 b)
Definition: sse.h:1609
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_sub_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:2291
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpunord_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:723
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_xor_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:2485
int8_t i8[8]
Definition: mmx.h:66
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_loadl_pi(simde__m128 a, simde__m64 const *mem_addr)
Definition: sse.h:1425
SIMDE__FUNCTION_ATTRIBUTES void simde_mm_store_ps(simde_float32 mem_addr[4], simde__m128 a)
Definition: sse.h:2182
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_load1_ps(simde_float32 const *mem_addr)
Definition: sse.h:1392
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpeq_ss(simde__m128 a, simde__m128 b)
Definition: sse.h:283
SIMDE__FUNCTION_ATTRIBUTES simde__m128 simde_mm_cmpeq_ps(simde__m128 a, simde__m128 b)
Definition: sse.h:264
Definition: sse.h:1293
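On the non-native fallback path, simde_mm_getcsr() and simde_mm_setcsr() only model the MXCSR rounding-control field (bits 13-14: 0 = to nearest, 1 = downward, 2 = upward, 3 = toward zero), translating it to and from the C fenv rounding mode; all other status and control bits are ignored. A small sketch of reading and rewriting that field through these functions, assuming nothing beyond this header and the standard library:

#include <stdint.h>
#include <stdio.h>
#include "sse.h"

int main(void)
{
	/* Read the control/status word, clear the rounding-control field
	 * (bits 13-14) and set it to 3 = round toward zero. */
	uint32_t csr = simde_mm_getcsr();
	csr = (csr & ~(3u << 13)) | (3u << 13);
	simde_mm_setcsr(csr);

	/* The field should now read back as 3 (toward zero). */
	printf("rounding control: %u\n", (unsigned)((simde_mm_getcsr() >> 13) & 3));
	return 0;
}

On the native path the same calls simply forward to _mm_getcsr() and _mm_setcsr(), so the full MXCSR word is honoured there.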