core/stdarch/crates/core_arch/src/x86/
sse.rs

1//! Streaming SIMD Extensions (SSE)
2
3use crate::{
4    core_arch::{simd::*, x86::*},
5    intrinsics::simd::*,
6    intrinsics::sqrtf32,
7    mem, ptr,
8};
9
10#[cfg(test)]
11use stdarch_test::assert_instr;
12
/// Adds the first component of `a` and `b`, the other components are copied
/// from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ss(a: __m128, b: __m128) -> __m128 {
    // Scalar add in lane 0 only; lanes 1-3 keep `a`'s values.
    simd_insert!(a, 0, _mm_cvtss_f32(a) + _mm_cvtss_f32(b))
}
24
/// Adds packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(addps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_add_ps(a: __m128, b: __m128) -> __m128 {
    // Element-wise add over all four lanes.
    simd_add(a, b)
}
36
/// Subtracts the first component of `b` from `a`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ss(a: __m128, b: __m128) -> __m128 {
    // Scalar subtract (a - b) in lane 0 only; lanes 1-3 keep `a`'s values.
    simd_insert!(a, 0, _mm_cvtss_f32(a) - _mm_cvtss_f32(b))
}
48
/// Subtracts packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(subps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sub_ps(a: __m128, b: __m128) -> __m128 {
    // Element-wise subtract (a - b) over all four lanes.
    simd_sub(a, b)
}
60
/// Multiplies the first component of `a` and `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ss(a: __m128, b: __m128) -> __m128 {
    // Scalar multiply in lane 0 only; lanes 1-3 keep `a`'s values.
    simd_insert!(a, 0, _mm_cvtss_f32(a) * _mm_cvtss_f32(b))
}
72
/// Multiplies packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(mulps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_mul_ps(a: __m128, b: __m128) -> __m128 {
    // Element-wise multiply over all four lanes.
    simd_mul(a, b)
}
84
/// Divides the first component of `a` by `b`, the other components are
/// copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ss(a: __m128, b: __m128) -> __m128 {
    // Scalar divide (a / b) in lane 0 only; lanes 1-3 keep `a`'s values.
    simd_insert!(a, 0, _mm_cvtss_f32(a) / _mm_cvtss_f32(b))
}
96
/// Divides packed single-precision (32-bit) floating-point elements in `a` and
/// `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(divps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_div_ps(a: __m128, b: __m128) -> __m128 {
    // Element-wise divide (a / b) over all four lanes.
    simd_div(a, b)
}
108
/// Returns the square root of the first single-precision (32-bit)
/// floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ss(a: __m128) -> __m128 {
    // Exact (IEEE) square root of lane 0 via the `sqrtf32` intrinsic;
    // lanes 1-3 pass through untouched.
    simd_insert!(a, 0, sqrtf32(_mm_cvtss_f32(a)))
}
120
/// Returns the square root of packed single-precision (32-bit) floating-point
/// elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sqrt_ps(a: __m128) -> __m128 {
    // Exact (IEEE) square root of all four lanes.
    simd_fsqrt(a)
}
132
/// Returns the approximate reciprocal of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ss(a: __m128) -> __m128 {
    // Hardware approximation (RCPSS), not an exact 1.0/x; see Intel's docs
    // for the relative-error bound.
    rcpss(a)
}
144
/// Returns the approximate reciprocal of packed single-precision (32-bit)
/// floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rcpps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rcp_ps(a: __m128) -> __m128 {
    // Hardware approximation (RCPPS) over all four lanes; not an exact
    // reciprocal.
    rcpps(a)
}
156
/// Returns the approximate reciprocal square root of the first single-precision
/// (32-bit) floating-point element in `a`, the other elements are unchanged.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ss(a: __m128) -> __m128 {
    // Hardware approximation (RSQRTSS) of 1/sqrt(x) in lane 0 only.
    rsqrtss(a)
}
168
/// Returns the approximate reciprocal square root of packed single-precision
/// (32-bit) floating-point elements in `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(rsqrtps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_rsqrt_ps(a: __m128) -> __m128 {
    // Hardware approximation (RSQRTPS) of 1/sqrt(x) over all four lanes.
    rsqrtps(a)
}
180
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the minimum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ss(a: __m128, b: __m128) -> __m128 {
    // Defers to the MINSS intrinsic to keep x86's operand-order-dependent
    // NaN/±0.0 semantics (see the note on `_mm_min_ps`).
    minss(a, b)
}
193
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding minimum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(minps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_min_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmin`.
    minps(a, b)
}
206
/// Compares the first single-precision (32-bit) floating-point element of `a`
/// and `b`, and return the maximum value in the first element of the return
/// value, the other elements are copied from `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ss(a: __m128, b: __m128) -> __m128 {
    // Defers to the MAXSS intrinsic to keep x86's operand-order-dependent
    // NaN/±0.0 semantics (see the note on `_mm_max_ps`).
    maxss(a, b)
}
219
/// Compares packed single-precision (32-bit) floating-point elements in `a` and
/// `b`, and return the corresponding maximum values.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(maxps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_max_ps(a: __m128, b: __m128) -> __m128 {
    // See the `test_mm_min_ps` test why this can't be implemented using `simd_fmax`.
    maxps(a, b)
}
232
/// Bitwise AND of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `and` instructions, so ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_and_ps(a: __m128, b: __m128) -> __m128 {
    // There is no float bitwise AND, so reinterpret as integer vectors,
    // AND, and reinterpret back; the bit pattern is preserved exactly.
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_and(a, b))
}
249
/// Bitwise AND-NOT of packed single-precision (32-bit) floating-point
/// elements.
///
/// Computes `!a & b` for each bit in `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `not` and `and` instructions, so ignore
// it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(andnps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_andnot_ps(a: __m128, b: __m128) -> __m128 {
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    // NOT is expressed as XOR with an all-ones mask, then AND with `b`,
    // matching the ANDNPS `!a & b` semantics.
    let mask: __m128i = mem::transmute(i32x4::splat(-1));
    mem::transmute(simd_and(simd_xor(mask, a), b))
}
271
/// Bitwise OR of packed single-precision (32-bit) floating-point elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `or` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(orps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_or_ps(a: __m128, b: __m128) -> __m128 {
    // Reinterpret as integer vectors for the bitwise OR, then back to float.
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_or(a, b))
}
288
/// Bitwise exclusive OR of packed single-precision (32-bit) floating-point
/// elements.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps)
#[inline]
#[target_feature(enable = "sse")]
// i586 only seems to generate plain `xor` instructions, so we ignore it.
#[cfg_attr(
    all(test, any(target_arch = "x86_64", target_feature = "sse2")),
    assert_instr(xorps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_xor_ps(a: __m128, b: __m128) -> __m128 {
    // Reinterpret as integer vectors for the bitwise XOR, then back to float.
    let a: __m128i = mem::transmute(a);
    let b: __m128i = mem::transmute(b);
    mem::transmute(simd_xor(a, b))
}
306
/// Compares the lowest `f32` of both inputs for equality. The lowest 32 bits of
/// the result will be `0xffffffff` if the two inputs are equal, or `0`
/// otherwise. The upper 96 bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ss(a: __m128, b: __m128) -> __m128 {
    // imm8 = 0 selects the EQ predicate of CMPSS.
    cmpss(a, b, 0)
}
319
/// Compares the lowest `f32` of both inputs for less than. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ss(a: __m128, b: __m128) -> __m128 {
    // imm8 = 1 selects the LT predicate of CMPSS.
    cmpss(a, b, 1)
}
333
/// Compares the lowest `f32` of both inputs for less than or equal. The lowest
/// 32 bits of the result will be `0xffffffff` if `a.extract(0)` is less than
/// or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ss(a: __m128, b: __m128) -> __m128 {
    // imm8 = 2 selects the LE predicate of CMPSS.
    cmpss(a, b, 2)
}
347
/// Compares the lowest `f32` of both inputs for greater than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ss(a: __m128, b: __m128) -> __m128 {
    // There is no GT predicate: a > b is computed as b < a (hence
    // `assert_instr(cmpltss)` above). Since `cmpss(b, a, 1)` carries `b`'s
    // upper lanes, the shuffle takes lane 0 from the compare result (index 4)
    // and lanes 1-3 from `a`.
    simd_shuffle!(a, cmpss(b, a, 1), [4, 1, 2, 3])
}
361
/// Compares the lowest `f32` of both inputs for greater than or equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is
/// greater than or equal `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ss(a: __m128, b: __m128) -> __m128 {
    // There is no GE predicate: a >= b is computed as b <= a (imm8 = 2,
    // hence `assert_instr(cmpless)`); the shuffle restores lanes 1-3 from `a`.
    simd_shuffle!(a, cmpss(b, a, 2), [4, 1, 2, 3])
}
375
/// Compares the lowest `f32` of both inputs for inequality. The lowest 32 bits
/// of the result will be `0xffffffff` if `a.extract(0)` is not equal to
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ss(a: __m128, b: __m128) -> __m128 {
    // imm8 = 4 selects the NEQ predicate of CMPSS.
    cmpss(a, b, 4)
}
389
/// Compares the lowest `f32` of both inputs for not-less-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not less than
/// `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are the
/// upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ss(a: __m128, b: __m128) -> __m128 {
    // imm8 = 5 selects the NLT predicate of CMPSS (true for NaN operands).
    cmpss(a, b, 5)
}
403
/// Compares the lowest `f32` of both inputs for not-less-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// less than or equal to `b.extract(0)`, or `0` otherwise. The upper 96 bits
/// of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ss(a: __m128, b: __m128) -> __m128 {
    // imm8 = 6 selects the NLE predicate of CMPSS (true for NaN operands).
    cmpss(a, b, 6)
}
417
/// Compares the lowest `f32` of both inputs for not-greater-than. The lowest 32
/// bits of the result will be `0xffffffff` if `a.extract(0)` is not greater
/// than `b.extract(0)`, or `0` otherwise. The upper 96 bits of the result are
/// the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ss(a: __m128, b: __m128) -> __m128 {
    // !(a > b) is computed as !(b < a) (imm8 = 5, hence
    // `assert_instr(cmpnltss)`); the shuffle restores lanes 1-3 from `a`.
    simd_shuffle!(a, cmpss(b, a, 5), [4, 1, 2, 3])
}
431
/// Compares the lowest `f32` of both inputs for not-greater-than-or-equal. The
/// lowest 32 bits of the result will be `0xffffffff` if `a.extract(0)` is not
/// greater than or equal to `b.extract(0)`, or `0` otherwise. The upper 96
/// bits of the result are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnless))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ss(a: __m128, b: __m128) -> __m128 {
    // !(a >= b) is computed as !(b <= a) (imm8 = 6, hence
    // `assert_instr(cmpnless)`); the shuffle restores lanes 1-3 from `a`.
    simd_shuffle!(a, cmpss(b, a, 6), [4, 1, 2, 3])
}
445
/// Checks if the lowest `f32` of both inputs are ordered. The lowest 32 bits of
/// the result will be `0xffffffff` if neither of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ss(a: __m128, b: __m128) -> __m128 {
    // imm8 = 7 selects the ORD (neither operand NaN) predicate of CMPSS.
    cmpss(a, b, 7)
}
459
/// Checks if the lowest `f32` of both inputs are unordered. The lowest 32 bits
/// of the result will be `0xffffffff` if any of `a.extract(0)` or
/// `b.extract(0)` is a NaN, or `0` otherwise. The upper 96 bits of the result
/// are the upper 96 bits of `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ss(a: __m128, b: __m128) -> __m128 {
    // imm8 = 3 selects the UNORD (either operand NaN) predicate of CMPSS.
    cmpss(a, b, 3)
}
473
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// were equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpeqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpeq_ps(a: __m128, b: __m128) -> __m128 {
    // imm8 = 0 selects the EQ predicate of CMPPS.
    cmpps(a, b, 0)
}
486
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmplt_ps(a: __m128, b: __m128) -> __m128 {
    // imm8 = 1 selects the LT predicate of CMPPS.
    cmpps(a, b, 1)
}
499
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is less than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmple_ps(a: __m128, b: __m128) -> __m128 {
    // imm8 = 2 selects the LE predicate of CMPPS.
    cmpps(a, b, 2)
}
513
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than the corresponding element in `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpgt_ps(a: __m128, b: __m128) -> __m128 {
    // No GT predicate exists: a > b is computed as b < a (imm8 = 1), which
    // is why `assert_instr` expects CMPLTPS.
    cmpps(b, a, 1)
}
526
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is greater than or equal to the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpge_ps(a: __m128, b: __m128) -> __m128 {
    // No GE predicate exists: a >= b is computed as b <= a (imm8 = 2), which
    // is why `assert_instr` expects CMPLEPS.
    cmpps(b, a, 2)
}
540
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input elements
/// are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpneqps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpneq_ps(a: __m128, b: __m128) -> __m128 {
    // imm8 = 4 selects the NEQ predicate of CMPPS.
    cmpps(a, b, 4)
}
553
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnlt_ps(a: __m128, b: __m128) -> __m128 {
    // imm8 = 5 selects the NLT predicate of CMPPS (true for NaN operands).
    cmpps(a, b, 5)
}
567
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** less than or equal to the corresponding element in `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnle_ps(a: __m128, b: __m128) -> __m128 {
    // imm8 = 6 selects the NLE predicate of CMPPS (true for NaN operands).
    cmpps(a, b, 6)
}
581
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than the corresponding element in `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnltps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpngt_ps(a: __m128, b: __m128) -> __m128 {
    // !(a > b) is computed as !(b < a) (imm8 = 5 with swapped operands),
    // which is why `assert_instr` expects CMPNLTPS.
    cmpps(b, a, 5)
}
595
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// The result in the output vector will be `0xffffffff` if the input element
/// in `a` is **not** greater than or equal to the corresponding element in `b`,
/// or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpnleps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpnge_ps(a: __m128, b: __m128) -> __m128 {
    // !(a >= b) is computed as !(b <= a) (imm8 = 6 with swapped operands),
    // which is why `assert_instr` expects CMPNLEPS.
    cmpps(b, a, 6)
}
609
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are ordered (i.e., neither of them is a NaN), or 0 otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpord_ps(a: __m128, b: __m128) -> __m128 {
    // imm8 = 7 selects the ORD predicate of CMPPS; the predicate is
    // symmetric, so the swapped operand order does not change the result.
    cmpps(b, a, 7)
}
623
/// Compares each of the four floats in `a` to the corresponding element in `b`.
/// Returns four floats that have one of two possible bit patterns. The element
/// in the output vector will be `0xffffffff` if the input elements in `a` and
/// `b` are unordered (i.e., at least one of them is a NaN), or 0 otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cmpunordps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cmpunord_ps(a: __m128, b: __m128) -> __m128 {
    // imm8 = 3 selects the UNORD predicate of CMPPS; the predicate is
    // symmetric, so the swapped operand order does not change the result.
    cmpps(b, a, 3)
}
637
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comieq_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (per assert_instr); the helper materializes the
    // flag result as 0 or 1.
    comieq_ss(a, b)
}
649
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comilt_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (per assert_instr); returns 1 if a.0 < b.0, else 0.
    comilt_ss(a, b)
}
661
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comile_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (per assert_instr); returns 1 if a.0 <= b.0, else 0.
    comile_ss(a, b)
}
674
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comigt_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (per assert_instr); returns 1 if a.0 > b.0, else 0.
    comigt_ss(a, b)
}
687
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comige_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (per assert_instr); returns 1 if a.0 >= b.0, else 0.
    comige_ss(a, b)
}
700
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(comiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_comineq_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to COMISS (per assert_instr); returns 1 if a.0 != b.0, else 0.
    comineq_ss(a, b)
}
712
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are equal, or `0` otherwise. This instruction will not signal
/// an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomieq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomieq_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to UCOMISS (per assert_instr) — the quiet-NaN-tolerant variant
    // of COMISS; returns 1 if a.0 == b.0, else 0.
    ucomieq_ss(a, b)
}
725
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than the one from `b`, or `0` otherwise.
/// This instruction will not signal an exception if either argument is a quiet
/// NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomilt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomilt_ss(a: __m128, b: __m128) -> i32 {
    // Lowers to UCOMISS (per assert_instr); returns 1 if a.0 < b.0, else 0.
    ucomilt_ss(a, b)
}
739
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is less than or equal to the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomile_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomile_ss(a: __m128, b: __m128) -> i32 {
    // Unordered (quiet) variant: lowers to `UCOMISS` per the assert above.
    ucomile_ss(a, b)
}
753
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than the one from `b`, or `0`
/// otherwise. This instruction will not signal an exception if either argument
/// is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomigt_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomigt_ss(a: __m128, b: __m128) -> i32 {
    // Unordered (quiet) variant: lowers to `UCOMISS` per the assert above.
    ucomigt_ss(a, b)
}
767
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if the value from `a` is greater than or equal to the one from `b`, or
/// `0` otherwise. This instruction will not signal an exception if either
/// argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomige_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomige_ss(a: __m128, b: __m128) -> i32 {
    // Unordered (quiet) variant: lowers to `UCOMISS` per the assert above.
    ucomige_ss(a, b)
}
781
/// Compares two 32-bit floats from the low-order bits of `a` and `b`. Returns
/// `1` if they are **not** equal, or `0` otherwise. This instruction will not
/// signal an exception if either argument is a quiet NaN.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ucomineq_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ucomiss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_ucomineq_ss(a: __m128, b: __m128) -> i32 {
    // Unordered (quiet) variant: lowers to `UCOMISS` per the assert above.
    ucomineq_ss(a, b)
}
794
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer.
///
/// The result is rounded according to the current rounding mode. If the result
/// cannot be represented as a 32 bit integer the result will be `0x8000_0000`
/// (`i32::MIN`).
///
/// This corresponds to the `CVTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtss_si32(a: __m128) -> i32 {
    // Raw LLVM intrinsic binding; rounding honors the current MXCSR mode.
    cvtss2si(a)
}
811
/// Alias for [`_mm_cvtss_si32`](fn._mm_cvtss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvt_ss2si(a: __m128) -> i32 {
    // Pure forwarder; the older Intel name kept for API compatibility.
    _mm_cvtss_si32(a)
}
822
/// Converts the lowest 32 bit float in the input vector to a 32 bit integer
/// with
/// truncation.
///
/// The result is rounded always using truncation (round towards zero). If the
/// result cannot be represented as a 32 bit integer the result will be
/// `0x8000_0000` (`i32::MIN`).
///
/// This corresponds to the `CVTTSS2SI` instruction (with 32 bit output).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvttss_si32(a: __m128) -> i32 {
    // Raw LLVM intrinsic binding; truncating variant ignores the MXCSR
    // rounding mode.
    cvttss2si(a)
}
841
/// Alias for [`_mm_cvttss_si32`](fn._mm_cvttss_si32.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvttss2si))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtt_ss2si(a: __m128) -> i32 {
    // Pure forwarder; the older Intel name kept for API compatibility.
    _mm_cvttss_si32(a)
}
852
/// Extracts the lowest 32 bit float from the input vector.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32)
#[inline]
#[target_feature(enable = "sse")]
// No point in using assert_instrs. In Unix x86_64 calling convention this is a
// no-op, and on msvc it's just a `mov`.
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtss_f32(a: __m128) -> f32 {
    // Lane 0 is the lowest 32 bits of the vector.
    simd_extract!(a, 0)
}
864
/// Converts a 32 bit integer to a 32 bit float. The result vector is the input
/// vector `a` with the lowest 32 bit float replaced by the converted integer.
///
/// This intrinsic corresponds to the `CVTSI2SS` instruction (with 32 bit
/// input).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvtsi32_ss(a: __m128, b: i32) -> __m128 {
    // Raw LLVM intrinsic binding; only lane 0 of `a` is replaced.
    cvtsi2ss(a, b)
}
879
/// Alias for [`_mm_cvtsi32_ss`](fn._mm_cvtsi32_ss.html).
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(cvtsi2ss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_cvt_si2ss(a: __m128, b: i32) -> __m128 {
    // Pure forwarder; the older Intel name kept for API compatibility.
    _mm_cvtsi32_ss(a, b)
}
890
/// Construct a `__m128` with the lowest element set to `a` and the rest set to
/// zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ss(a: f32) -> __m128 {
    // Array order is memory order: element 0 is the lowest lane.
    __m128([a, 0.0, 0.0, 0.0])
}
902
/// Construct a `__m128` with all element set to `a`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set1_ps(a: f32) -> __m128 {
    // Broadcast `a` into all four lanes.
    __m128([a, a, a, a])
}
913
/// Alias for [`_mm_set1_ps`](fn._mm_set1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ps1(a: f32) -> __m128 {
    // Pure forwarder; the older Intel name kept for API compatibility.
    _mm_set1_ps(a)
}
924
/// Construct a `__m128` from four floating point values highest to lowest.
///
/// Note that `a` will be the highest 32 bits of the result, and `d` the
/// lowest. This matches the standard way of writing bit patterns on x86:
///
/// ```text
///  bit    127 .. 96  95 .. 64  63 .. 32  31 .. 0
///        +---------+---------+---------+---------+
///        |    a    |    b    |    c    |    d    |   result
///        +---------+---------+---------+---------+
/// ```
///
/// Alternatively:
///
/// ```text
/// let v = _mm_set_ps(d, c, b, a);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_set_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // Arguments are given high-to-low, but the array is memory order
    // (low-to-high), hence the reversal.
    __m128([d, c, b, a])
}
951
/// Construct a `__m128` from four floating point values lowest to highest.
///
/// This matches the memory order of `__m128`, i.e., `a` will be the lowest 32
/// bits of the result, and `d` the highest.
///
/// ```text
/// assert_eq!(__m128::new(a, b, c, d), _mm_setr_ps(a, b, c, d));
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(
    all(test, any(target_env = "msvc", target_arch = "x86_64")),
    assert_instr(unpcklps)
)]
// On a 32-bit architecture on non-msvc it just copies the operands from the stack.
#[cfg_attr(
    all(test, all(not(target_env = "msvc"), target_arch = "x86")),
    assert_instr(movaps)
)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setr_ps(a: f32, b: f32, c: f32, d: f32) -> __m128 {
    // "r" = reversed relative to `_mm_set_ps`: arguments already match
    // memory order, so no reordering is needed.
    __m128([a, b, c, d])
}
977
/// Construct a `__m128` with all elements initialized to zero.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(xorps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_setzero_ps() -> __m128 {
    // All-zero bits are a valid `f32x4` (+0.0 in every lane); the `const`
    // block evaluates this at compile time.
    const { mem::zeroed() }
}
988
989/// A utility function for creating masks to use with Intel shuffle and
990/// permute intrinsics.
991#[inline]
992#[allow(non_snake_case)]
993#[unstable(feature = "stdarch_x86_mm_shuffle", issue = "111147")]
994pub const fn _MM_SHUFFLE(z: u32, y: u32, x: u32, w: u32) -> i32 {
995    ((z << 6) | (y << 4) | (x << 2) | w) as i32
996}
997
/// Shuffles packed single-precision (32-bit) floating-point elements in `a` and
/// `b` using `MASK`.
///
/// The lower half of result takes values from `a` and the higher half from
/// `b`. Mask is split to 2 control bits each to index the element from inputs.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps)
///
/// Note that there appears to be a mistake within Intel's Intrinsics Guide.
/// `_mm_shuffle_ps` is supposed to take an `i32` instead of a `u32`
/// as is the case for [other shuffle intrinsics](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_).
/// Performing an implicit type conversion between an unsigned integer and a signed integer
/// does not cause a problem in C, however Rust's commitment to strong typing does not allow this.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(shufps, MASK = 3))]
#[rustc_legacy_const_generics(2)]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_shuffle_ps<const MASK: i32>(a: __m128, b: __m128) -> __m128 {
    // Only the low 8 bits of MASK are meaningful (four 2-bit selectors).
    static_assert_uimm_bits!(MASK, 8);
    // `simd_shuffle!` indexes the concatenation of `a` and `b`:
    // 0..=3 select from `a`, 4..=7 select from `b` — hence the `+ 4`
    // offset on the two result lanes sourced from `b`.
    simd_shuffle!(
        a,
        b,
        [
            MASK as u32 & 0b11,
            (MASK as u32 >> 2) & 0b11,
            ((MASK as u32 >> 4) & 0b11) + 4,
            ((MASK as u32 >> 6) & 0b11) + 4,
        ],
    )
}
1029
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the higher half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpckhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpackhi_ps(a: __m128, b: __m128) -> __m128 {
    // Indices 0..=3 address `a`, 4..=7 address `b`:
    // result = [a[2], b[2], a[3], b[3]].
    simd_shuffle!(a, b, [2, 6, 3, 7])
}
1041
/// Unpacks and interleave single-precision (32-bit) floating-point elements
/// from the lower half of `a` and `b`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(unpcklps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_unpacklo_ps(a: __m128, b: __m128) -> __m128 {
    // Indices 0..=3 address `a`, 4..=7 address `b`:
    // result = [a[0], b[0], a[1], b[1]].
    simd_shuffle!(a, b, [0, 4, 1, 5])
}
1053
/// Combine higher half of `a` and `b`. The higher half of `b` occupies the
/// lower half of result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movhlps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movehl_ps(a: __m128, b: __m128) -> __m128 {
    // TODO; figure why this is a different instruction on msvc?
    // Indices 4..=7 address `b`: result = [b[2], b[3], a[2], a[3]].
    simd_shuffle!(a, b, [6, 7, 2, 3])
}
1066
/// Combine lower half of `a` and `b`. The lower half of `b` occupies the
/// higher half of result.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(all(test, not(target_env = "msvc")), assert_instr(movlhps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movelh_ps(a: __m128, b: __m128) -> __m128 {
    // Indices 4..=7 address `b`: result = [a[0], a[1], b[0], b[1]].
    simd_shuffle!(a, b, [0, 1, 4, 5])
}
1078
/// Returns a mask of the most significant bit of each element in `a`.
///
/// The mask is stored in the 4 least significant bits of the return value.
/// All other bits are set to `0`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movmskps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_movemask_ps(a: __m128) -> i32 {
    // Propagate the highest bit to the rest, because simd_bitmask
    // requires all-1 or all-0.
    // Reinterpreting as i32 lanes, a set sign bit makes the lane negative,
    // so `simd_lt(.., 0)` yields all-ones exactly for those lanes.
    let mask: i32x4 = simd_lt(transmute(a), i32x4::ZERO);
    // Pack one bit per lane into the low 4 bits of a u8, then widen.
    simd_bitmask::<i32x4, u8>(mask).into()
}
1095
/// Construct a `__m128` with the lowest element read from `p` and the other
/// elements set to zero.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ss(p: *const f32) -> __m128 {
    // Single scalar load into lane 0; upper lanes are zeroed.
    __m128([*p, 0.0, 0.0, 0.0])
}
1109
/// Construct a `__m128` by duplicating the value read from `p` into all
/// elements.
///
/// This corresponds to instructions `VMOVSS` / `MOVSS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load1_ps(p: *const f32) -> __m128 {
    // Read the scalar once, then broadcast it to all four lanes.
    let a = *p;
    __m128([a, a, a, a])
}
1125
/// Alias for [`_mm_load1_ps`](fn._mm_load1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_load_ps1(p: *const f32) -> __m128 {
    // Pure forwarder; the older Intel name kept for API compatibility.
    _mm_load1_ps(p)
}
1136
/// Loads four `f32` values from *aligned* memory into a `__m128`. If the
/// pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_loadu_ps`](fn._mm_loadu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_load_ps(p: *const f32) -> __m128 {
    // An aligned 16-byte read; the caller guarantees 16-byte alignment.
    *(p as *const __m128)
}
1155
/// Loads four `f32` values from memory into a `__m128`. There are no
/// restrictions
/// on memory alignment. For aligned memory
/// [`_mm_load_ps`](fn._mm_load_ps.html)
/// may be faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadu_ps(p: *const f32) -> __m128 {
    // Note: Using `*p` would require `f32` alignment, but `movups` has no
    // alignment restrictions.
    // A 16-byte `memcpy` through `u8` pointers carries no alignment
    // requirement and compiles down to the unaligned vector load.
    let mut dst = _mm_undefined_ps();
    ptr::copy_nonoverlapping(
        p as *const u8,
        ptr::addr_of_mut!(dst) as *mut u8,
        mem::size_of::<__m128>(),
    );
    dst
}
1180
/// Loads four `f32` values from aligned memory into a `__m128` in reverse
/// order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let a0 = *p;
/// let a1 = *p.add(1);
/// let a2 = *p.add(2);
/// let a3 = *p.add(3);
/// __m128::new(a3, a2, a1, a0)
/// ```
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS` followed by some
/// shuffling.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_loadr_ps(p: *const f32) -> __m128 {
    // Aligned load, then reverse the four lanes.
    let a = _mm_load_ps(p);
    simd_shuffle!(a, a, [3, 2, 1, 0])
}
1210
/// Stores the lowest 32 bit float of `a` into memory.
///
/// This intrinsic corresponds to the `MOVSS` instruction.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ss(p: *mut f32, a: __m128) {
    // Extract lane 0 and write it as a scalar `f32`.
    *p = simd_extract!(a, 0);
}
1223
/// Stores the lowest 32 bit float of `a` repeated four times into *aligned*
/// memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// let x = a.extract(0);
/// *p = x;
/// *p.add(1) = x;
/// *p.add(2) = x;
/// *p.add(3) = x;
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store1_ps(p: *mut f32, a: __m128) {
    // Broadcast lane 0 across the vector, then do one aligned 16-byte store.
    let b: __m128 = simd_shuffle!(a, a, [0, 0, 0, 0]);
    *(p as *mut __m128) = b;
}
1251
/// Alias for [`_mm_store1_ps`](fn._mm_store1_ps.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_store_ps1(p: *mut f32, a: __m128) {
    // Pure forwarder; the older Intel name kept for API compatibility.
    _mm_store1_ps(p, a);
}
1262
/// Stores four 32-bit floats into *aligned* memory.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Use [`_mm_storeu_ps`](fn._mm_storeu_ps.html) for potentially unaligned
/// memory.
///
/// This corresponds to instructions `VMOVAPS` / `MOVAPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_store_ps(p: *mut f32, a: __m128) {
    // An aligned 16-byte write; the caller guarantees 16-byte alignment.
    *(p as *mut __m128) = a;
}
1282
/// Stores four 32-bit floats into memory. There are no restrictions on memory
/// alignment. For aligned memory [`_mm_store_ps`](fn._mm_store_ps.html) may be
/// faster.
///
/// This corresponds to instructions `VMOVUPS` / `MOVUPS`.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movups))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_storeu_ps(p: *mut f32, a: __m128) {
    // A 16-byte `memcpy` through `u8` pointers carries no alignment
    // requirement and compiles down to the unaligned vector store.
    ptr::copy_nonoverlapping(
        ptr::addr_of!(a) as *const u8,
        p as *mut u8,
        mem::size_of::<__m128>(),
    );
}
1301
/// Stores four 32-bit floats into *aligned* memory in reverse order.
///
/// If the pointer is not aligned to a 128-bit boundary (16 bytes) a general
/// protection fault will be triggered (fatal program crash).
///
/// Functionally equivalent to the following code sequence (assuming `p`
/// satisfies the alignment restrictions):
///
/// ```text
/// *p = a.extract(3);
/// *p.add(1) = a.extract(2);
/// *p.add(2) = a.extract(1);
/// *p.add(3) = a.extract(0);
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movaps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_storer_ps(p: *mut f32, a: __m128) {
    // Reverse the four lanes, then do one aligned 16-byte store.
    let b: __m128 = simd_shuffle!(a, a, [3, 2, 1, 0]);
    *(p as *mut __m128) = b;
}
1327
/// Returns a `__m128` with the first component from `b` and the remaining
/// components from `a`.
///
/// In other words for any `a` and `b`:
/// ```text
/// _mm_move_ss(a, b) == a.replace(0, b.extract(0))
/// ```
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movss))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_move_ss(a: __m128, b: __m128) -> __m128 {
    // Index 4 is lane 0 of `b` (indices 4..=7 address `b`); lanes 1-3 stay
    // from `a`.
    simd_shuffle!(a, b, [4, 1, 2, 3])
}
1344
/// Performs a serializing operation on all non-temporal ("streaming") store instructions that
/// were issued by the current thread prior to this instruction.
///
/// Guarantees that every non-temporal store instruction that precedes this fence, in program order, is
/// ordered before any load or store instruction which follows the fence in
/// synchronization order.
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence)
/// (but note that Intel is only documenting the hardware-level concerns related to this
/// instruction; the Intel documentation does not take into account the extra concerns that arise
/// because the Rust memory model is different from the x86 memory model.)
///
/// # Safety of non-temporal stores
///
/// After using any non-temporal store intrinsic, but before any other access to the memory that the
/// intrinsic mutates, a call to `_mm_sfence` must be performed on the thread that used the
/// intrinsic.
///
/// Non-temporal stores behave very different from regular stores. For the purpose of the Rust
/// memory model, these stores are happening asynchronously in a background thread. This means a
/// non-temporal store can cause data races with other accesses, even other accesses on the same
/// thread. It also means that cross-thread synchronization does not work as expected: let's say the
/// intrinsic is called on thread T1, and T1 performs synchronization with some other thread T2. The
/// non-temporal store acts as if it happened not in T1 but in a different thread T3, and T2 has not
/// synchronized with T3! Calling `_mm_sfence` makes the current thread wait for and synchronize
/// with all the non-temporal stores previously started on this thread, which means in particular
/// that subsequent synchronization with other threads will then work as intended again.
///
/// The general pattern to use non-temporal stores correctly is to call `_mm_sfence` before your
/// code jumps back to code outside your library. This ensures all stores inside your function
/// are synchronized-before the return, and thus transitively synchronized-before everything
/// the caller does after your function returns.
//
// The following is not a doc comment since it's not clear whether we want to put this into the
// docs, but it should be written out somewhere.
//
// Formally, we consider non-temporal stores and sfences to be opaque blobs that the compiler cannot
// inspect, and that behave like the following functions. This explains where the docs above come
// from.
// ```
// #[thread_local]
// static mut PENDING_NONTEMP_WRITES = AtomicUsize::new(0);
//
// pub unsafe fn nontemporal_store<T>(ptr: *mut T, val: T) {
//     PENDING_NONTEMP_WRITES.fetch_add(1, Relaxed);
//     // Spawn a thread that will eventually do our write.
//     // We need to fetch a pointer to this thread's pending-write
//     // counter, so that we can access it from the background thread.
//     let pending_writes = addr_of!(PENDING_NONTEMP_WRITES);
//     // If this was actual Rust code we'd have to do some extra work
//     // because `ptr`, `val`, `pending_writes` are all `!Send`. We skip that here.
//     std::thread::spawn(move || {
//         // Do the write in the background thread.
//         ptr.write(val);
//         // Register the write as done. Crucially, this is `Release`, so it
//         // syncs-with the `Acquire in `sfence`.
//         (&*pending_writes).fetch_sub(1, Release);
//     });
// }
//
// pub fn sfence() {
//     unsafe {
//         // Wait until there are no more pending writes.
//         while PENDING_NONTEMP_WRITES.load(Acquire) > 0 {}
//     }
// }
// ```
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(sfence))]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_sfence() {
    // Thin wrapper over the `sfence` LLVM intrinsic binding (declared
    // elsewhere in this module).
    sfence()
}
1419
/// Gets the unsigned 32-bit value of the MXCSR control and status register.
///
/// Note that Rust makes no guarantees whatsoever about the contents of this register: Rust
/// floating-point operations may or may not result in this register getting updated with exception
/// state, and the register can change between two invocations of this function even when no
/// floating-point operations appear in the source code (since floating-point operations appearing
/// earlier or later can be reordered).
///
/// If you need to perform some floating-point operations and check whether they raised an
/// exception, use an inline assembly block for the entire sequence of operations.
///
/// For more info see [`_mm_setcsr`](fn._mm_setcsr.html)
///
/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(stmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[deprecated(
    since = "1.75.0",
    note = "see `_mm_getcsr` documentation - use inline assembly instead"
)]
pub unsafe fn _mm_getcsr() -> u32 {
    // `STMXCSR` writes the 32-bit register into memory; the intrinsic
    // binding takes a byte pointer, hence the `*mut i8` cast of the local.
    let mut result = 0_i32;
    stmxcsr(ptr::addr_of_mut!(result) as *mut i8);
    result as u32
}
1447
1448/// Sets the MXCSR register with the 32-bit unsigned integer value.
1449///
1450/// This register controls how SIMD instructions handle floating point
1451/// operations. Modifying this register only affects the current thread.
1452///
1453/// It contains several groups of flags:
1454///
1455/// * *Exception flags* report which exceptions occurred since last they were reset.
1456///
1457/// * *Masking flags* can be used to mask (ignore) certain exceptions. By default
1458///   these flags are all set to 1, so all exceptions are masked. When
1459///   an exception is masked, the processor simply sets the exception flag and
1460///   continues the operation. If the exception is unmasked, the flag is also set
1461///   but additionally an exception handler is invoked.
1462///
1463/// * *Rounding mode flags* control the rounding mode of floating point
1464///   instructions.
1465///
1466/// * The *denormals-are-zero mode flag* turns all numbers which would be
1467///   denormalized (exponent bits are all zeros) into zeros.
1468///
1469/// Note that modifying the masking flags, rounding mode, or denormals-are-zero mode flags leads to
1470/// **immediate Undefined Behavior**: Rust assumes that these are always in their default state and
1471/// will optimize accordingly. This even applies when the register is altered and later reset to its
1472/// original value without any floating-point operations appearing in the source code between those
1473/// operations (since floating-point operations appearing earlier or later can be reordered).
1474///
/// If you need to perform some floating-point operations under different masking flags, rounding
1476/// mode, or denormals-are-zero mode, use an inline assembly block and make sure to restore the
1477/// original MXCSR register state before the end of the block.
1478///
1479/// ## Exception Flags
1480///
1481/// * `_MM_EXCEPT_INVALID`: An invalid operation was performed (e.g., dividing
1482///   Infinity by Infinity).
1483///
1484/// * `_MM_EXCEPT_DENORM`: An operation attempted to operate on a denormalized
1485///   number. Mainly this can cause loss of precision.
1486///
1487/// * `_MM_EXCEPT_DIV_ZERO`: Division by zero occurred.
1488///
1489/// * `_MM_EXCEPT_OVERFLOW`: A numeric overflow exception occurred, i.e., a
1490///   result was too large to be represented (e.g., an `f32` with absolute
1491///   value greater than `2^128`).
1492///
1493/// * `_MM_EXCEPT_UNDERFLOW`: A numeric underflow exception occurred, i.e., a
1494///   result was too small to be represented in a normalized way (e.g., an
1495///   `f32` with absolute value smaller than `2^-126`.)
1496///
1497/// * `_MM_EXCEPT_INEXACT`: An inexact-result exception occurred (a.k.a.
1498///   precision exception). This means some precision was lost due to rounding.
1499///   For example, the fraction `1/3` cannot be represented accurately in a
1500///   32 or 64 bit float and computing it would cause this exception to be
1501///   raised. Precision exceptions are very common, so they are usually masked.
1502///
1503/// Exception flags can be read and set using the convenience functions
1504/// `_MM_GET_EXCEPTION_STATE` and `_MM_SET_EXCEPTION_STATE`. For example, to
1505/// check if an operation caused some overflow:
1506///
1507/// ```rust,ignore
1508/// _MM_SET_EXCEPTION_STATE(0); // clear all exception flags
1509///                             // perform calculations
1510/// if _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_OVERFLOW != 0 {
1511///     // handle overflow
1512/// }
1513/// ```
1514///
1515/// ## Masking Flags
1516///
1517/// There is one masking flag for each exception flag: `_MM_MASK_INVALID`,
1518/// `_MM_MASK_DENORM`, `_MM_MASK_DIV_ZERO`, `_MM_MASK_OVERFLOW`,
1519/// `_MM_MASK_UNDERFLOW`, `_MM_MASK_INEXACT`.
1520///
1521/// A single masking bit can be set via
1522///
1523/// ```rust,ignore
1524/// _MM_SET_EXCEPTION_MASK(_MM_MASK_UNDERFLOW);
1525/// ```
1526///
1527/// However, since mask bits are by default all set to 1, it is more common to
1528/// want to *disable* certain bits. For example, to unmask the underflow
1529/// exception, use:
1530///
1531/// ```rust,ignore
/// _mm_setcsr(_mm_getcsr() & !_MM_MASK_UNDERFLOW); // unmask underflow exception
1534/// ```
1535///
1536/// Warning: an unmasked exception will cause an exception handler to be
1537/// called.
1538/// The standard handler will simply terminate the process. So, in this case
1539/// any underflow exception would terminate the current process with something
1540/// like `signal: 8, SIGFPE: erroneous arithmetic operation`.
1541///
1542/// ## Rounding Mode
1543///
/// The rounding mode is described using two bits. It can be read and set using
1545/// the convenience wrappers `_MM_GET_ROUNDING_MODE()` and
1546/// `_MM_SET_ROUNDING_MODE(mode)`.
1547///
1548/// The rounding modes are:
1549///
1550/// * `_MM_ROUND_NEAREST`: (default) Round to closest to the infinite precision
1551///   value. If two values are equally close, round to even (i.e., least
1552///   significant bit will be zero).
1553///
1554/// * `_MM_ROUND_DOWN`: Round toward negative Infinity.
1555///
1556/// * `_MM_ROUND_UP`: Round toward positive Infinity.
1557///
1558/// * `_MM_ROUND_TOWARD_ZERO`: Round towards zero (truncate).
1559///
1560/// Example:
1561///
1562/// ```rust,ignore
1563/// _MM_SET_ROUNDING_MODE(_MM_ROUND_DOWN)
1564/// ```
1565///
1566/// ## Denormals-are-zero/Flush-to-zero Mode
1567///
1568/// If this bit is set, values that would be denormalized will be set to zero
1569/// instead. This is turned off by default.
1570///
1571/// You can read and enable/disable this mode via the helper functions
1572/// `_MM_GET_FLUSH_ZERO_MODE()` and `_MM_SET_FLUSH_ZERO_MODE()`:
1573///
1574/// ```rust,ignore
1575/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF); // turn off (default)
1576/// _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); // turn on
1577/// ```
1578///
1579///
1580/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr)
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(ldmxcsr))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[deprecated(
    since = "1.75.0",
    note = "see `_mm_setcsr` documentation - use inline assembly instead"
)]
pub unsafe fn _mm_setcsr(val: u32) {
    // `ldmxcsr` loads the MXCSR register from the pointed-to 32-bit value;
    // the LLVM intrinsic takes an untyped byte pointer, hence the cast.
    ldmxcsr(ptr::addr_of!(val) as *const i8);
}
1592
// MXCSR exception-status bits: each flag records that the corresponding
// floating-point exception has occurred since the flag was last cleared.

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INVALID: u32 = 0x0001;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DENORM: u32 = 0x0002;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_DIV_ZERO: u32 = 0x0004;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_OVERFLOW: u32 = 0x0008;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_UNDERFLOW: u32 = 0x0010;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_INEXACT: u32 = 0x0020;
/// See [`_MM_GET_EXCEPTION_STATE`](fn._MM_GET_EXCEPTION_STATE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_EXCEPT_MASK: u32 = 0x003f; // union of all six exception-status bits
1614
// MXCSR exception-mask bits: a set bit suppresses (masks) the corresponding
// exception; all six are set by default.

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INVALID: u32 = 0x0080;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DENORM: u32 = 0x0100;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_DIV_ZERO: u32 = 0x0200;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_OVERFLOW: u32 = 0x0400;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_UNDERFLOW: u32 = 0x0800;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_INEXACT: u32 = 0x1000;
/// See [`_MM_GET_EXCEPTION_MASK`](fn._MM_GET_EXCEPTION_MASK.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_MASK_MASK: u32 = 0x1f80; // union of all six exception-mask bits
1636
// MXCSR rounding-mode field: a two-bit field selecting one of four modes.

/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_NEAREST: u32 = 0x0000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_DOWN: u32 = 0x2000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_UP: u32 = 0x4000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_TOWARD_ZERO: u32 = 0x6000;

/// See [`_MM_GET_ROUNDING_MODE`](fn._MM_GET_ROUNDING_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_ROUND_MASK: u32 = 0x6000; // both rounding-mode bits
1653
// MXCSR flush-to-zero control bit.

/// See [`_MM_GET_FLUSH_ZERO_MODE`](fn._MM_GET_FLUSH_ZERO_MODE.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_MASK: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_ON: u32 = 0x8000;
/// See [`_mm_setcsr`](fn._mm_setcsr.html)
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_FLUSH_ZERO_OFF: u32 = 0x0000;
1663
1664/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1665///
1666/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_MASK)
1667#[inline]
1668#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1669#[allow(non_snake_case)]
1670#[target_feature(enable = "sse")]
1671#[stable(feature = "simd_x86", since = "1.27.0")]
1672#[deprecated(
1673    since = "1.75.0",
1674    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1675)]
1676pub unsafe fn _MM_GET_EXCEPTION_MASK() -> u32 {
1677    _mm_getcsr() & _MM_MASK_MASK
1678}
1679
1680/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1681///
1682/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_EXCEPTION_STATE)
1683#[inline]
1684#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1685#[allow(non_snake_case)]
1686#[target_feature(enable = "sse")]
1687#[stable(feature = "simd_x86", since = "1.27.0")]
1688#[deprecated(
1689    since = "1.75.0",
1690    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1691)]
1692pub unsafe fn _MM_GET_EXCEPTION_STATE() -> u32 {
1693    _mm_getcsr() & _MM_EXCEPT_MASK
1694}
1695
1696/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1697///
1698/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE)
1699#[inline]
1700#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1701#[allow(non_snake_case)]
1702#[target_feature(enable = "sse")]
1703#[stable(feature = "simd_x86", since = "1.27.0")]
1704#[deprecated(
1705    since = "1.75.0",
1706    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1707)]
1708pub unsafe fn _MM_GET_FLUSH_ZERO_MODE() -> u32 {
1709    _mm_getcsr() & _MM_FLUSH_ZERO_MASK
1710}
1711
1712/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1713///
1714/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE)
1715#[inline]
1716#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1717#[allow(non_snake_case)]
1718#[target_feature(enable = "sse")]
1719#[stable(feature = "simd_x86", since = "1.27.0")]
1720#[deprecated(
1721    since = "1.75.0",
1722    note = "see `_mm_getcsr` documentation - use inline assembly instead"
1723)]
1724pub unsafe fn _MM_GET_ROUNDING_MODE() -> u32 {
1725    _mm_getcsr() & _MM_ROUND_MASK
1726}
1727
1728/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1729///
1730/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_MASK)
1731#[inline]
1732#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1733#[allow(non_snake_case)]
1734#[target_feature(enable = "sse")]
1735#[stable(feature = "simd_x86", since = "1.27.0")]
1736#[deprecated(
1737    since = "1.75.0",
1738    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1739)]
1740pub unsafe fn _MM_SET_EXCEPTION_MASK(x: u32) {
1741    _mm_setcsr((_mm_getcsr() & !_MM_MASK_MASK) | x)
1742}
1743
1744/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1745///
1746/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_EXCEPTION_STATE)
1747#[inline]
1748#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1749#[allow(non_snake_case)]
1750#[target_feature(enable = "sse")]
1751#[stable(feature = "simd_x86", since = "1.27.0")]
1752#[deprecated(
1753    since = "1.75.0",
1754    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1755)]
1756pub unsafe fn _MM_SET_EXCEPTION_STATE(x: u32) {
1757    _mm_setcsr((_mm_getcsr() & !_MM_EXCEPT_MASK) | x)
1758}
1759
1760/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1761///
1762/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE)
1763#[inline]
1764#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1765#[allow(non_snake_case)]
1766#[target_feature(enable = "sse")]
1767#[stable(feature = "simd_x86", since = "1.27.0")]
1768#[deprecated(
1769    since = "1.75.0",
1770    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1771)]
1772pub unsafe fn _MM_SET_FLUSH_ZERO_MODE(x: u32) {
1773    let val = (_mm_getcsr() & !_MM_FLUSH_ZERO_MASK) | x;
1774    // println!("setting csr={:x}", val);
1775    _mm_setcsr(val)
1776}
1777
1778/// See [`_mm_setcsr`](fn._mm_setcsr.html)
1779///
1780/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE)
1781#[inline]
1782#[allow(deprecated)] // Deprecated function implemented on top of deprecated function
1783#[allow(non_snake_case)]
1784#[target_feature(enable = "sse")]
1785#[stable(feature = "simd_x86", since = "1.27.0")]
1786#[deprecated(
1787    since = "1.75.0",
1788    note = "see `_mm_setcsr` documentation - use inline assembly instead"
1789)]
1790pub unsafe fn _MM_SET_ROUNDING_MODE(x: u32) {
1791    _mm_setcsr((_mm_getcsr() & !_MM_ROUND_MASK) | x)
1792}
1793
// Prefetch strategy constants. In `_mm_prefetch`, the low two bits become the
// `locality` argument of `llvm.prefetch` and bit 2 becomes its read/write
// hint (hence ET0/ET1 are T0/T1 with bit 2 set).

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T0: i32 = 3;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T1: i32 = 2;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_T2: i32 = 1;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_NTA: i32 = 0;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET0: i32 = 7;

/// See [`_mm_prefetch`](fn._mm_prefetch.html).
#[stable(feature = "simd_x86", since = "1.27.0")]
pub const _MM_HINT_ET1: i32 = 6;
1817
1818/// Fetch the cache line that contains address `p` using the given `STRATEGY`.
1819///
1820/// The `STRATEGY` must be one of:
1821///
1822/// * [`_MM_HINT_T0`](constant._MM_HINT_T0.html): Fetch into all levels of the
1823///   cache hierarchy.
1824///
1825/// * [`_MM_HINT_T1`](constant._MM_HINT_T1.html): Fetch into L2 and higher.
1826///
1827/// * [`_MM_HINT_T2`](constant._MM_HINT_T2.html): Fetch into L3 and higher or
1828///   an implementation-specific choice (e.g., L2 if there is no L3).
1829///
1830/// * [`_MM_HINT_NTA`](constant._MM_HINT_NTA.html): Fetch data using the
1831///   non-temporal access (NTA) hint. It may be a place closer than main memory
1832///   but outside of the cache hierarchy. This is used to reduce access latency
1833///   without polluting the cache.
1834///
1835/// * [`_MM_HINT_ET0`](constant._MM_HINT_ET0.html) and
1836///   [`_MM_HINT_ET1`](constant._MM_HINT_ET1.html) are similar to `_MM_HINT_T0`
1837///   and `_MM_HINT_T1` but indicate an anticipation to write to the address.
1838///
1839/// The actual implementation depends on the particular CPU. This instruction
1840/// is considered a hint, so the CPU is also free to simply ignore the request.
1841///
1842/// The amount of prefetched data depends on the cache line size of the
1843/// specific CPU, but it will be at least 32 bytes.
1844///
1845/// Common caveats:
1846///
1847/// * Most modern CPUs already automatically prefetch data based on predicted
1848///   access patterns.
1849///
1850/// * Data is usually not fetched if this would cause a TLB miss or a page
1851///   fault.
1852///
1853/// * Too much prefetching can cause unnecessary cache evictions.
1854///
1855/// * Prefetching may also fail if there are not enough memory-subsystem
1856///   resources (e.g., request buffers).
1857///
1858///
1859/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch)
1860#[inline]
1861#[target_feature(enable = "sse")]
1862#[cfg_attr(test, assert_instr(prefetcht0, STRATEGY = _MM_HINT_T0))]
1863#[cfg_attr(test, assert_instr(prefetcht1, STRATEGY = _MM_HINT_T1))]
1864#[cfg_attr(test, assert_instr(prefetcht2, STRATEGY = _MM_HINT_T2))]
1865#[cfg_attr(test, assert_instr(prefetchnta, STRATEGY = _MM_HINT_NTA))]
1866#[rustc_legacy_const_generics(1)]
1867#[stable(feature = "simd_x86", since = "1.27.0")]
1868pub unsafe fn _mm_prefetch<const STRATEGY: i32>(p: *const i8) {
1869    static_assert_uimm_bits!(STRATEGY, 3);
1870    // We use the `llvm.prefetch` intrinsic with `cache type` = 1 (data cache).
1871    // `locality` and `rw` are based on our `STRATEGY`.
1872    prefetch(p, (STRATEGY >> 2) & 1, STRATEGY & 3, 1);
1873}
1874
1875/// Returns vector of type __m128 with indeterminate elements.
1876/// Despite being "undefined", this is some valid value and not equivalent to [`mem::MaybeUninit`].
1877/// In practice, this is equivalent to [`mem::zeroed`].
1878///
1879/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps)
#[inline]
#[target_feature(enable = "sse")]
#[stable(feature = "simd_x86", since = "1.27.0")]
pub unsafe fn _mm_undefined_ps() -> __m128 {
    // Evaluated at compile time: an all-zero-bytes vector (see docs above).
    const { mem::zeroed() }
}
1886
1887/// Transpose the 4x4 matrix formed by 4 rows of __m128 in place.
1888///
1889/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_TRANSPOSE4_PS)
1890#[inline]
1891#[allow(non_snake_case)]
1892#[target_feature(enable = "sse")]
1893#[stable(feature = "simd_x86", since = "1.27.0")]
1894pub unsafe fn _MM_TRANSPOSE4_PS(
1895    row0: &mut __m128,
1896    row1: &mut __m128,
1897    row2: &mut __m128,
1898    row3: &mut __m128,
1899) {
1900    let tmp0 = _mm_unpacklo_ps(*row0, *row1);
1901    let tmp2 = _mm_unpacklo_ps(*row2, *row3);
1902    let tmp1 = _mm_unpackhi_ps(*row0, *row1);
1903    let tmp3 = _mm_unpackhi_ps(*row2, *row3);
1904
1905    *row0 = _mm_movelh_ps(tmp0, tmp2);
1906    *row1 = _mm_movehl_ps(tmp2, tmp0);
1907    *row2 = _mm_movelh_ps(tmp1, tmp3);
1908    *row3 = _mm_movehl_ps(tmp3, tmp1);
1909}
1910
// Declarations of the LLVM intrinsics backing the SSE intrinsics above. The
// `link_name` strings are LLVM intrinsic names, not C symbols, which is why
// `improper_ctypes` is allowed here. These names must match LLVM exactly.
#[allow(improper_ctypes)]
extern "C" {
    #[link_name = "llvm.x86.sse.rcp.ss"]
    fn rcpss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rcp.ps"]
    fn rcpps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ss"]
    fn rsqrtss(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.rsqrt.ps"]
    fn rsqrtps(a: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ss"]
    fn minss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.min.ps"]
    fn minps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ss"]
    fn maxss(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.max.ps"]
    fn maxps(a: __m128, b: __m128) -> __m128;
    #[link_name = "llvm.x86.sse.cmp.ps"]
    fn cmpps(a: __m128, b: __m128, imm8: i8) -> __m128;
    // Ordered (`comi*`, signal on QNaN) and unordered (`ucomi*`) scalar
    // comparisons returning 0 or 1.
    #[link_name = "llvm.x86.sse.comieq.ss"]
    fn comieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comilt.ss"]
    fn comilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comile.ss"]
    fn comile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comigt.ss"]
    fn comigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comige.ss"]
    fn comige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.comineq.ss"]
    fn comineq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomieq.ss"]
    fn ucomieq_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomilt.ss"]
    fn ucomilt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomile.ss"]
    fn ucomile_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomigt.ss"]
    fn ucomigt_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomige.ss"]
    fn ucomige_ss(a: __m128, b: __m128) -> i32;
    #[link_name = "llvm.x86.sse.ucomineq.ss"]
    fn ucomineq_ss(a: __m128, b: __m128) -> i32;
    // Scalar f32 <-> i32 conversions.
    #[link_name = "llvm.x86.sse.cvtss2si"]
    fn cvtss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvttss2si"]
    fn cvttss2si(a: __m128) -> i32;
    #[link_name = "llvm.x86.sse.cvtsi2ss"]
    fn cvtsi2ss(a: __m128, b: i32) -> __m128;
    #[link_name = "llvm.x86.sse.sfence"]
    fn sfence();
    // MXCSR store/load (see `_mm_getcsr` / `_mm_setcsr`).
    #[link_name = "llvm.x86.sse.stmxcsr"]
    fn stmxcsr(p: *mut i8);
    #[link_name = "llvm.x86.sse.ldmxcsr"]
    fn ldmxcsr(p: *const i8);
    #[link_name = "llvm.prefetch"]
    fn prefetch(p: *const i8, rw: i32, loc: i32, ty: i32);
    #[link_name = "llvm.x86.sse.cmp.ss"]
    fn cmpss(a: __m128, b: __m128, imm8: i8) -> __m128;
}
1972
1973/// Stores `a` into the memory at `mem_addr` using a non-temporal memory hint.
1974///
1975/// `mem_addr` must be aligned on a 16-byte boundary or a general-protection
1976/// exception _may_ be generated.
1977///
1978/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps)
1979///
1980/// # Safety of non-temporal stores
1981///
1982/// After using this intrinsic, but before any other access to the memory that this intrinsic
1983/// mutates, a call to [`_mm_sfence`] must be performed by the thread that used the intrinsic. In
1984/// particular, functions that call this intrinsic should generally call `_mm_sfence` before they
1985/// return.
1986///
1987/// See [`_mm_sfence`] for details.
#[inline]
#[target_feature(enable = "sse")]
#[cfg_attr(test, assert_instr(movntps))]
#[stable(feature = "simd_x86", since = "1.27.0")]
#[allow(clippy::cast_ptr_alignment)]
pub unsafe fn _mm_stream_ps(mem_addr: *mut f32, a: __m128) {
    // Non-temporal store emitted via inline assembly: `p` is the destination
    // pointer operand and `a` the 128-bit source register.
    // NOTE(review): `vps!` presumably expands to the memory-operand form of
    // `movntps` (and its VEX variant where appropriate) — confirm against the
    // macro's definition elsewhere in this crate.
    crate::arch::asm!(
        vps!("movntps", ",{a}"),
        p = in(reg) mem_addr,
        a = in(xmm_reg) a,
        options(nostack, preserves_flags),
    );
}
2001
2002#[cfg(test)]
2003mod tests {
2004    use crate::{hint::black_box, mem::transmute, ptr};
2005    use std::boxed;
2006    use stdarch_test::simd_test;
2007
2008    use crate::core_arch::{simd::*, x86::*};
2009
2010    const NAN: f32 = f32::NAN;
2011
2012    #[simd_test(enable = "sse")]
2013    unsafe fn test_mm_add_ps() {
2014        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2015        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2016        let r = _mm_add_ps(a, b);
2017        assert_eq_m128(r, _mm_setr_ps(-101.0, 25.0, 0.0, -15.0));
2018    }
2019
2020    #[simd_test(enable = "sse")]
2021    unsafe fn test_mm_add_ss() {
2022        let a = _mm_set_ps(-1.0, 5.0, 0.0, -10.0);
2023        let b = _mm_set_ps(-100.0, 20.0, 0.0, -5.0);
2024        let r = _mm_add_ss(a, b);
2025        assert_eq_m128(r, _mm_set_ps(-1.0, 5.0, 0.0, -15.0));
2026    }
2027
2028    #[simd_test(enable = "sse")]
2029    unsafe fn test_mm_sub_ps() {
2030        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2031        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2032        let r = _mm_sub_ps(a, b);
2033        assert_eq_m128(r, _mm_setr_ps(99.0, -15.0, 0.0, -5.0));
2034    }
2035
2036    #[simd_test(enable = "sse")]
2037    unsafe fn test_mm_sub_ss() {
2038        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2039        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2040        let r = _mm_sub_ss(a, b);
2041        assert_eq_m128(r, _mm_setr_ps(99.0, 5.0, 0.0, -10.0));
2042    }
2043
2044    #[simd_test(enable = "sse")]
2045    unsafe fn test_mm_mul_ps() {
2046        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2047        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2048        let r = _mm_mul_ps(a, b);
2049        assert_eq_m128(r, _mm_setr_ps(100.0, 100.0, 0.0, 50.0));
2050    }
2051
2052    #[simd_test(enable = "sse")]
2053    unsafe fn test_mm_mul_ss() {
2054        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2055        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2056        let r = _mm_mul_ss(a, b);
2057        assert_eq_m128(r, _mm_setr_ps(100.0, 5.0, 0.0, -10.0));
2058    }
2059
2060    #[simd_test(enable = "sse")]
2061    unsafe fn test_mm_div_ps() {
2062        let a = _mm_setr_ps(-1.0, 5.0, 2.0, -10.0);
2063        let b = _mm_setr_ps(-100.0, 20.0, 0.2, -5.0);
2064        let r = _mm_div_ps(a, b);
2065        assert_eq_m128(r, _mm_setr_ps(0.01, 0.25, 10.0, 2.0));
2066    }
2067
2068    #[simd_test(enable = "sse")]
2069    unsafe fn test_mm_div_ss() {
2070        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2071        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2072        let r = _mm_div_ss(a, b);
2073        assert_eq_m128(r, _mm_setr_ps(0.01, 5.0, 0.0, -10.0));
2074    }
2075
2076    #[simd_test(enable = "sse")]
2077    unsafe fn test_mm_sqrt_ss() {
2078        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2079        let r = _mm_sqrt_ss(a);
2080        let e = _mm_setr_ps(2.0, 13.0, 16.0, 100.0);
2081        assert_eq_m128(r, e);
2082    }
2083
2084    #[simd_test(enable = "sse")]
2085    unsafe fn test_mm_sqrt_ps() {
2086        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2087        let r = _mm_sqrt_ps(a);
2088        let e = _mm_setr_ps(2.0, 3.6055512, 4.0, 10.0);
2089        assert_eq_m128(r, e);
2090    }
2091
2092    #[simd_test(enable = "sse")]
2093    unsafe fn test_mm_rcp_ss() {
2094        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2095        let r = _mm_rcp_ss(a);
2096        let e = _mm_setr_ps(0.24993896, 13.0, 16.0, 100.0);
2097        let rel_err = 0.00048828125;
2098        assert_approx_eq!(get_m128(r, 0), get_m128(e, 0), 2. * rel_err);
2099        for i in 1..4 {
2100            assert_eq!(get_m128(r, i), get_m128(e, i));
2101        }
2102    }
2103
2104    #[simd_test(enable = "sse")]
2105    unsafe fn test_mm_rcp_ps() {
2106        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2107        let r = _mm_rcp_ps(a);
2108        let e = _mm_setr_ps(0.24993896, 0.0769043, 0.06248474, 0.0099983215);
2109        let rel_err = 0.00048828125;
2110        for i in 0..4 {
2111            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2112        }
2113    }
2114
2115    #[simd_test(enable = "sse")]
2116    unsafe fn test_mm_rsqrt_ss() {
2117        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2118        let r = _mm_rsqrt_ss(a);
2119        let e = _mm_setr_ps(0.49987793, 13.0, 16.0, 100.0);
2120        let rel_err = 0.00048828125;
2121        for i in 0..4 {
2122            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2123        }
2124    }
2125
2126    #[simd_test(enable = "sse")]
2127    unsafe fn test_mm_rsqrt_ps() {
2128        let a = _mm_setr_ps(4.0, 13.0, 16.0, 100.0);
2129        let r = _mm_rsqrt_ps(a);
2130        let e = _mm_setr_ps(0.49987793, 0.2772827, 0.24993896, 0.099990845);
2131        let rel_err = 0.00048828125;
2132        for i in 0..4 {
2133            assert_approx_eq!(get_m128(r, i), get_m128(e, i), 2. * rel_err);
2134        }
2135    }
2136
2137    #[simd_test(enable = "sse")]
2138    unsafe fn test_mm_min_ss() {
2139        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2140        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2141        let r = _mm_min_ss(a, b);
2142        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2143    }
2144
2145    #[simd_test(enable = "sse")]
2146    unsafe fn test_mm_min_ps() {
2147        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2148        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2149        let r = _mm_min_ps(a, b);
2150        assert_eq_m128(r, _mm_setr_ps(-100.0, 5.0, 0.0, -10.0));
2151
2152        // `_mm_min_ps` can **not** be implemented using the `simd_min` rust intrinsic. `simd_min`
2153        // is lowered by the llvm codegen backend to `llvm.minnum.v*` llvm intrinsic. This intrinsic
2154        // doesn't specify how -0.0 is handled. Unfortunately it happens to behave different from
2155        // the `minps` x86 instruction on x86. The `llvm.minnum.v*` llvm intrinsic equals
2156        // `r1` to `a` and `r2` to `b`.
2157        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
2158        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
2159        let r1: [u8; 16] = transmute(_mm_min_ps(a, b));
2160        let r2: [u8; 16] = transmute(_mm_min_ps(b, a));
2161        let a: [u8; 16] = transmute(a);
2162        let b: [u8; 16] = transmute(b);
2163        assert_eq!(r1, b);
2164        assert_eq!(r2, a);
2165        assert_ne!(a, b); // sanity check that -0.0 is actually present
2166    }
2167
2168    #[simd_test(enable = "sse")]
2169    unsafe fn test_mm_max_ss() {
2170        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2171        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2172        let r = _mm_max_ss(a, b);
2173        assert_eq_m128(r, _mm_setr_ps(-1.0, 5.0, 0.0, -10.0));
2174    }
2175
2176    #[simd_test(enable = "sse")]
2177    unsafe fn test_mm_max_ps() {
2178        let a = _mm_setr_ps(-1.0, 5.0, 0.0, -10.0);
2179        let b = _mm_setr_ps(-100.0, 20.0, 0.0, -5.0);
2180        let r = _mm_max_ps(a, b);
2181        assert_eq_m128(r, _mm_setr_ps(-1.0, 20.0, 0.0, -5.0));
2182
2183        // Check SSE-specific semantics for -0.0 handling.
2184        let a = _mm_setr_ps(-0.0, 0.0, 0.0, 0.0);
2185        let b = _mm_setr_ps(0.0, 0.0, 0.0, 0.0);
2186        let r1: [u8; 16] = transmute(_mm_max_ps(a, b));
2187        let r2: [u8; 16] = transmute(_mm_max_ps(b, a));
2188        let a: [u8; 16] = transmute(a);
2189        let b: [u8; 16] = transmute(b);
2190        assert_eq!(r1, b);
2191        assert_eq!(r2, a);
2192        assert_ne!(a, b); // sanity check that -0.0 is actually present
2193    }
2194
2195    #[simd_test(enable = "sse")]
2196    unsafe fn test_mm_and_ps() {
2197        let a = transmute(u32x4::splat(0b0011));
2198        let b = transmute(u32x4::splat(0b0101));
2199        let r = _mm_and_ps(*black_box(&a), *black_box(&b));
2200        let e = transmute(u32x4::splat(0b0001));
2201        assert_eq_m128(r, e);
2202    }
2203
2204    #[simd_test(enable = "sse")]
2205    unsafe fn test_mm_andnot_ps() {
2206        let a = transmute(u32x4::splat(0b0011));
2207        let b = transmute(u32x4::splat(0b0101));
2208        let r = _mm_andnot_ps(*black_box(&a), *black_box(&b));
2209        let e = transmute(u32x4::splat(0b0100));
2210        assert_eq_m128(r, e);
2211    }
2212
2213    #[simd_test(enable = "sse")]
2214    unsafe fn test_mm_or_ps() {
2215        let a = transmute(u32x4::splat(0b0011));
2216        let b = transmute(u32x4::splat(0b0101));
2217        let r = _mm_or_ps(*black_box(&a), *black_box(&b));
2218        let e = transmute(u32x4::splat(0b0111));
2219        assert_eq_m128(r, e);
2220    }
2221
2222    #[simd_test(enable = "sse")]
2223    unsafe fn test_mm_xor_ps() {
2224        let a = transmute(u32x4::splat(0b0011));
2225        let b = transmute(u32x4::splat(0b0101));
2226        let r = _mm_xor_ps(*black_box(&a), *black_box(&b));
2227        let e = transmute(u32x4::splat(0b0110));
2228        assert_eq_m128(r, e);
2229    }
2230
2231    #[simd_test(enable = "sse")]
2232    unsafe fn test_mm_cmpeq_ss() {
2233        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2234        let b = _mm_setr_ps(-1.0, 5.0, 6.0, 7.0);
2235        let r: u32x4 = transmute(_mm_cmpeq_ss(a, b));
2236        let e: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0), 2.0, 3.0, 4.0));
2237        assert_eq!(r, e);
2238
2239        let b2 = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2240        let r2: u32x4 = transmute(_mm_cmpeq_ss(a, b2));
2241        let e2: u32x4 = transmute(_mm_setr_ps(f32::from_bits(0xffffffff), 2.0, 3.0, 4.0));
2242        assert_eq!(r2, e2);
2243    }
2244
2245    #[simd_test(enable = "sse")]
2246    unsafe fn test_mm_cmplt_ss() {
2247        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2248        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2249        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2250        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2251
2252        let b1 = 0u32; // a.extract(0) < b.extract(0)
2253        let c1 = 0u32; // a.extract(0) < c.extract(0)
2254        let d1 = !0u32; // a.extract(0) < d.extract(0)
2255
2256        let rb: u32x4 = transmute(_mm_cmplt_ss(a, b));
2257        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2258        assert_eq!(rb, eb);
2259
2260        let rc: u32x4 = transmute(_mm_cmplt_ss(a, c));
2261        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2262        assert_eq!(rc, ec);
2263
2264        let rd: u32x4 = transmute(_mm_cmplt_ss(a, d));
2265        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2266        assert_eq!(rd, ed);
2267    }
2268
2269    #[simd_test(enable = "sse")]
2270    unsafe fn test_mm_cmple_ss() {
2271        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2272        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2273        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2274        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2275
2276        let b1 = 0u32; // a.extract(0) <= b.extract(0)
2277        let c1 = !0u32; // a.extract(0) <= c.extract(0)
2278        let d1 = !0u32; // a.extract(0) <= d.extract(0)
2279
2280        let rb: u32x4 = transmute(_mm_cmple_ss(a, b));
2281        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2282        assert_eq!(rb, eb);
2283
2284        let rc: u32x4 = transmute(_mm_cmple_ss(a, c));
2285        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2286        assert_eq!(rc, ec);
2287
2288        let rd: u32x4 = transmute(_mm_cmple_ss(a, d));
2289        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2290        assert_eq!(rd, ed);
2291    }
2292
2293    #[simd_test(enable = "sse")]
2294    unsafe fn test_mm_cmpgt_ss() {
2295        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2296        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2297        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2298        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2299
2300        let b1 = !0u32; // a.extract(0) > b.extract(0)
2301        let c1 = 0u32; // a.extract(0) > c.extract(0)
2302        let d1 = 0u32; // a.extract(0) > d.extract(0)
2303
2304        let rb: u32x4 = transmute(_mm_cmpgt_ss(a, b));
2305        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2306        assert_eq!(rb, eb);
2307
2308        let rc: u32x4 = transmute(_mm_cmpgt_ss(a, c));
2309        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2310        assert_eq!(rc, ec);
2311
2312        let rd: u32x4 = transmute(_mm_cmpgt_ss(a, d));
2313        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2314        assert_eq!(rd, ed);
2315    }
2316
2317    #[simd_test(enable = "sse")]
2318    unsafe fn test_mm_cmpge_ss() {
2319        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2320        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2321        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2322        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2323
2324        let b1 = !0u32; // a.extract(0) >= b.extract(0)
2325        let c1 = !0u32; // a.extract(0) >= c.extract(0)
2326        let d1 = 0u32; // a.extract(0) >= d.extract(0)
2327
2328        let rb: u32x4 = transmute(_mm_cmpge_ss(a, b));
2329        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2330        assert_eq!(rb, eb);
2331
2332        let rc: u32x4 = transmute(_mm_cmpge_ss(a, c));
2333        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2334        assert_eq!(rc, ec);
2335
2336        let rd: u32x4 = transmute(_mm_cmpge_ss(a, d));
2337        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2338        assert_eq!(rd, ed);
2339    }
2340
2341    #[simd_test(enable = "sse")]
2342    unsafe fn test_mm_cmpneq_ss() {
2343        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2344        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2345        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2346        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2347
2348        let b1 = !0u32; // a.extract(0) != b.extract(0)
2349        let c1 = 0u32; // a.extract(0) != c.extract(0)
2350        let d1 = !0u32; // a.extract(0) != d.extract(0)
2351
2352        let rb: u32x4 = transmute(_mm_cmpneq_ss(a, b));
2353        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2354        assert_eq!(rb, eb);
2355
2356        let rc: u32x4 = transmute(_mm_cmpneq_ss(a, c));
2357        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2358        assert_eq!(rc, ec);
2359
2360        let rd: u32x4 = transmute(_mm_cmpneq_ss(a, d));
2361        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2362        assert_eq!(rd, ed);
2363    }
2364
2365    #[simd_test(enable = "sse")]
2366    unsafe fn test_mm_cmpnlt_ss() {
2367        // TODO: this test is exactly the same as for `_mm_cmpge_ss`, but there
2368        // must be a difference. It may have to do with behavior in the
2369        // presence of NaNs (signaling or quiet). If so, we should add tests
2370        // for those.
2371
2372        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2373        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2374        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2375        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2376
2377        let b1 = !0u32; // a.extract(0) >= b.extract(0)
2378        let c1 = !0u32; // a.extract(0) >= c.extract(0)
2379        let d1 = 0u32; // a.extract(0) >= d.extract(0)
2380
2381        let rb: u32x4 = transmute(_mm_cmpnlt_ss(a, b));
2382        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2383        assert_eq!(rb, eb);
2384
2385        let rc: u32x4 = transmute(_mm_cmpnlt_ss(a, c));
2386        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2387        assert_eq!(rc, ec);
2388
2389        let rd: u32x4 = transmute(_mm_cmpnlt_ss(a, d));
2390        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2391        assert_eq!(rd, ed);
2392    }
2393
2394    #[simd_test(enable = "sse")]
2395    unsafe fn test_mm_cmpnle_ss() {
2396        // TODO: this test is exactly the same as for `_mm_cmpgt_ss`, but there
2397        // must be a difference. It may have to do with behavior in the
2398        // presence
2399        // of NaNs (signaling or quiet). If so, we should add tests for those.
2400
2401        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2402        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2403        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2404        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2405
2406        let b1 = !0u32; // a.extract(0) > b.extract(0)
2407        let c1 = 0u32; // a.extract(0) > c.extract(0)
2408        let d1 = 0u32; // a.extract(0) > d.extract(0)
2409
2410        let rb: u32x4 = transmute(_mm_cmpnle_ss(a, b));
2411        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2412        assert_eq!(rb, eb);
2413
2414        let rc: u32x4 = transmute(_mm_cmpnle_ss(a, c));
2415        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2416        assert_eq!(rc, ec);
2417
2418        let rd: u32x4 = transmute(_mm_cmpnle_ss(a, d));
2419        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2420        assert_eq!(rd, ed);
2421    }
2422
2423    #[simd_test(enable = "sse")]
2424    unsafe fn test_mm_cmpngt_ss() {
2425        // TODO: this test is exactly the same as for `_mm_cmple_ss`, but there
2426        // must be a difference. It may have to do with behavior in the
2427        // presence of NaNs (signaling or quiet). If so, we should add tests
2428        // for those.
2429
2430        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2431        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2432        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2433        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2434
2435        let b1 = 0u32; // a.extract(0) <= b.extract(0)
2436        let c1 = !0u32; // a.extract(0) <= c.extract(0)
2437        let d1 = !0u32; // a.extract(0) <= d.extract(0)
2438
2439        let rb: u32x4 = transmute(_mm_cmpngt_ss(a, b));
2440        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2441        assert_eq!(rb, eb);
2442
2443        let rc: u32x4 = transmute(_mm_cmpngt_ss(a, c));
2444        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2445        assert_eq!(rc, ec);
2446
2447        let rd: u32x4 = transmute(_mm_cmpngt_ss(a, d));
2448        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2449        assert_eq!(rd, ed);
2450    }
2451
2452    #[simd_test(enable = "sse")]
2453    unsafe fn test_mm_cmpnge_ss() {
2454        // TODO: this test is exactly the same as for `_mm_cmplt_ss`, but there
2455        // must be a difference. It may have to do with behavior in the
2456        // presence of NaNs (signaling or quiet). If so, we should add tests
2457        // for those.
2458
2459        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2460        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2461        let c = _mm_setr_ps(1.0, 5.0, 6.0, 7.0);
2462        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2463
2464        let b1 = 0u32; // a.extract(0) < b.extract(0)
2465        let c1 = 0u32; // a.extract(0) < c.extract(0)
2466        let d1 = !0u32; // a.extract(0) < d.extract(0)
2467
2468        let rb: u32x4 = transmute(_mm_cmpnge_ss(a, b));
2469        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2470        assert_eq!(rb, eb);
2471
2472        let rc: u32x4 = transmute(_mm_cmpnge_ss(a, c));
2473        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2474        assert_eq!(rc, ec);
2475
2476        let rd: u32x4 = transmute(_mm_cmpnge_ss(a, d));
2477        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2478        assert_eq!(rd, ed);
2479    }
2480
2481    #[simd_test(enable = "sse")]
2482    unsafe fn test_mm_cmpord_ss() {
2483        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2484        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2485        let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2486        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2487
2488        let b1 = !0u32; // a.extract(0) ord b.extract(0)
2489        let c1 = 0u32; // a.extract(0) ord c.extract(0)
2490        let d1 = !0u32; // a.extract(0) ord d.extract(0)
2491
2492        let rb: u32x4 = transmute(_mm_cmpord_ss(a, b));
2493        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2494        assert_eq!(rb, eb);
2495
2496        let rc: u32x4 = transmute(_mm_cmpord_ss(a, c));
2497        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2498        assert_eq!(rc, ec);
2499
2500        let rd: u32x4 = transmute(_mm_cmpord_ss(a, d));
2501        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2502        assert_eq!(rd, ed);
2503    }
2504
2505    #[simd_test(enable = "sse")]
2506    unsafe fn test_mm_cmpunord_ss() {
2507        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
2508        let b = _mm_setr_ps(0.0, 5.0, 6.0, 7.0);
2509        let c = _mm_setr_ps(NAN, 5.0, 6.0, 7.0);
2510        let d = _mm_setr_ps(2.0, 5.0, 6.0, 7.0);
2511
2512        let b1 = 0u32; // a.extract(0) unord b.extract(0)
2513        let c1 = !0u32; // a.extract(0) unord c.extract(0)
2514        let d1 = 0u32; // a.extract(0) unord d.extract(0)
2515
2516        let rb: u32x4 = transmute(_mm_cmpunord_ss(a, b));
2517        let eb: u32x4 = transmute(_mm_setr_ps(f32::from_bits(b1), 2.0, 3.0, 4.0));
2518        assert_eq!(rb, eb);
2519
2520        let rc: u32x4 = transmute(_mm_cmpunord_ss(a, c));
2521        let ec: u32x4 = transmute(_mm_setr_ps(f32::from_bits(c1), 2.0, 3.0, 4.0));
2522        assert_eq!(rc, ec);
2523
2524        let rd: u32x4 = transmute(_mm_cmpunord_ss(a, d));
2525        let ed: u32x4 = transmute(_mm_setr_ps(f32::from_bits(d1), 2.0, 3.0, 4.0));
2526        assert_eq!(rd, ed);
2527    }
2528
2529    #[simd_test(enable = "sse")]
2530    unsafe fn test_mm_cmpeq_ps() {
2531        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2532        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2533        let tru = !0u32;
2534        let fls = 0u32;
2535
2536        let e = u32x4::new(fls, fls, tru, fls);
2537        let r: u32x4 = transmute(_mm_cmpeq_ps(a, b));
2538        assert_eq!(r, e);
2539    }
2540
2541    #[simd_test(enable = "sse")]
2542    unsafe fn test_mm_cmplt_ps() {
2543        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2544        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2545        let tru = !0u32;
2546        let fls = 0u32;
2547
2548        let e = u32x4::new(tru, fls, fls, fls);
2549        let r: u32x4 = transmute(_mm_cmplt_ps(a, b));
2550        assert_eq!(r, e);
2551    }
2552
2553    #[simd_test(enable = "sse")]
2554    unsafe fn test_mm_cmple_ps() {
2555        let a = _mm_setr_ps(10.0, 50.0, 1.0, 4.0);
2556        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2557        let tru = !0u32;
2558        let fls = 0u32;
2559
2560        let e = u32x4::new(tru, fls, tru, fls);
2561        let r: u32x4 = transmute(_mm_cmple_ps(a, b));
2562        assert_eq!(r, e);
2563    }
2564
2565    #[simd_test(enable = "sse")]
2566    unsafe fn test_mm_cmpgt_ps() {
2567        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2568        let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2569        let tru = !0u32;
2570        let fls = 0u32;
2571
2572        let e = u32x4::new(fls, tru, fls, fls);
2573        let r: u32x4 = transmute(_mm_cmpgt_ps(a, b));
2574        assert_eq!(r, e);
2575    }
2576
2577    #[simd_test(enable = "sse")]
2578    unsafe fn test_mm_cmpge_ps() {
2579        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2580        let b = _mm_setr_ps(15.0, 20.0, 1.0, 42.0);
2581        let tru = !0u32;
2582        let fls = 0u32;
2583
2584        let e = u32x4::new(fls, tru, tru, fls);
2585        let r: u32x4 = transmute(_mm_cmpge_ps(a, b));
2586        assert_eq!(r, e);
2587    }
2588
2589    #[simd_test(enable = "sse")]
2590    unsafe fn test_mm_cmpneq_ps() {
2591        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2592        let b = _mm_setr_ps(15.0, 20.0, 1.0, NAN);
2593        let tru = !0u32;
2594        let fls = 0u32;
2595
2596        let e = u32x4::new(tru, tru, fls, tru);
2597        let r: u32x4 = transmute(_mm_cmpneq_ps(a, b));
2598        assert_eq!(r, e);
2599    }
2600
2601    #[simd_test(enable = "sse")]
2602    unsafe fn test_mm_cmpnlt_ps() {
2603        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2604        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2605        let tru = !0u32;
2606        let fls = 0u32;
2607
2608        let e = u32x4::new(fls, tru, tru, tru);
2609        let r: u32x4 = transmute(_mm_cmpnlt_ps(a, b));
2610        assert_eq!(r, e);
2611    }
2612
2613    #[simd_test(enable = "sse")]
2614    unsafe fn test_mm_cmpnle_ps() {
2615        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2616        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2617        let tru = !0u32;
2618        let fls = 0u32;
2619
2620        let e = u32x4::new(fls, tru, fls, tru);
2621        let r: u32x4 = transmute(_mm_cmpnle_ps(a, b));
2622        assert_eq!(r, e);
2623    }
2624
2625    #[simd_test(enable = "sse")]
2626    unsafe fn test_mm_cmpngt_ps() {
2627        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2628        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2629        let tru = !0u32;
2630        let fls = 0u32;
2631
2632        let e = u32x4::new(tru, fls, tru, tru);
2633        let r: u32x4 = transmute(_mm_cmpngt_ps(a, b));
2634        assert_eq!(r, e);
2635    }
2636
2637    #[simd_test(enable = "sse")]
2638    unsafe fn test_mm_cmpnge_ps() {
2639        let a = _mm_setr_ps(10.0, 50.0, 1.0, NAN);
2640        let b = _mm_setr_ps(15.0, 20.0, 1.0, 5.0);
2641        let tru = !0u32;
2642        let fls = 0u32;
2643
2644        let e = u32x4::new(tru, fls, fls, tru);
2645        let r: u32x4 = transmute(_mm_cmpnge_ps(a, b));
2646        assert_eq!(r, e);
2647    }
2648
2649    #[simd_test(enable = "sse")]
2650    unsafe fn test_mm_cmpord_ps() {
2651        let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2652        let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2653        let tru = !0u32;
2654        let fls = 0u32;
2655
2656        let e = u32x4::new(tru, fls, fls, fls);
2657        let r: u32x4 = transmute(_mm_cmpord_ps(a, b));
2658        assert_eq!(r, e);
2659    }
2660
2661    #[simd_test(enable = "sse")]
2662    unsafe fn test_mm_cmpunord_ps() {
2663        let a = _mm_setr_ps(10.0, 50.0, NAN, NAN);
2664        let b = _mm_setr_ps(15.0, NAN, 1.0, NAN);
2665        let tru = !0u32;
2666        let fls = 0u32;
2667
2668        let e = u32x4::new(fls, tru, tru, tru);
2669        let r: u32x4 = transmute(_mm_cmpunord_ps(a, b));
2670        assert_eq!(r, e);
2671    }
2672
2673    #[simd_test(enable = "sse")]
2674    unsafe fn test_mm_comieq_ss() {
2675        let aa = &[3.0f32, 12.0, 23.0, NAN];
2676        let bb = &[3.0f32, 47.5, 1.5, NAN];
2677
2678        let ee = &[1i32, 0, 0, 0];
2679
2680        for i in 0..4 {
2681            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2682            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2683
2684            let r = _mm_comieq_ss(a, b);
2685
2686            assert_eq!(
2687                ee[i], r,
2688                "_mm_comieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2689                a, b, r, ee[i], i
2690            );
2691        }
2692    }
2693
2694    #[simd_test(enable = "sse")]
2695    unsafe fn test_mm_comilt_ss() {
2696        let aa = &[3.0f32, 12.0, 23.0, NAN];
2697        let bb = &[3.0f32, 47.5, 1.5, NAN];
2698
2699        let ee = &[0i32, 1, 0, 0];
2700
2701        for i in 0..4 {
2702            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2703            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2704
2705            let r = _mm_comilt_ss(a, b);
2706
2707            assert_eq!(
2708                ee[i], r,
2709                "_mm_comilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2710                a, b, r, ee[i], i
2711            );
2712        }
2713    }
2714
2715    #[simd_test(enable = "sse")]
2716    unsafe fn test_mm_comile_ss() {
2717        let aa = &[3.0f32, 12.0, 23.0, NAN];
2718        let bb = &[3.0f32, 47.5, 1.5, NAN];
2719
2720        let ee = &[1i32, 1, 0, 0];
2721
2722        for i in 0..4 {
2723            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2724            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2725
2726            let r = _mm_comile_ss(a, b);
2727
2728            assert_eq!(
2729                ee[i], r,
2730                "_mm_comile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2731                a, b, r, ee[i], i
2732            );
2733        }
2734    }
2735
2736    #[simd_test(enable = "sse")]
2737    unsafe fn test_mm_comigt_ss() {
2738        let aa = &[3.0f32, 12.0, 23.0, NAN];
2739        let bb = &[3.0f32, 47.5, 1.5, NAN];
2740
2741        let ee = &[1i32, 0, 1, 0];
2742
2743        for i in 0..4 {
2744            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2745            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2746
2747            let r = _mm_comige_ss(a, b);
2748
2749            assert_eq!(
2750                ee[i], r,
2751                "_mm_comige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2752                a, b, r, ee[i], i
2753            );
2754        }
2755    }
2756
2757    #[simd_test(enable = "sse")]
2758    unsafe fn test_mm_comineq_ss() {
2759        let aa = &[3.0f32, 12.0, 23.0, NAN];
2760        let bb = &[3.0f32, 47.5, 1.5, NAN];
2761
2762        let ee = &[0i32, 1, 1, 1];
2763
2764        for i in 0..4 {
2765            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2766            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2767
2768            let r = _mm_comineq_ss(a, b);
2769
2770            assert_eq!(
2771                ee[i], r,
2772                "_mm_comineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2773                a, b, r, ee[i], i
2774            );
2775        }
2776    }
2777
2778    #[simd_test(enable = "sse")]
2779    unsafe fn test_mm_ucomieq_ss() {
2780        let aa = &[3.0f32, 12.0, 23.0, NAN];
2781        let bb = &[3.0f32, 47.5, 1.5, NAN];
2782
2783        let ee = &[1i32, 0, 0, 0];
2784
2785        for i in 0..4 {
2786            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2787            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2788
2789            let r = _mm_ucomieq_ss(a, b);
2790
2791            assert_eq!(
2792                ee[i], r,
2793                "_mm_ucomieq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2794                a, b, r, ee[i], i
2795            );
2796        }
2797    }
2798
2799    #[simd_test(enable = "sse")]
2800    unsafe fn test_mm_ucomilt_ss() {
2801        let aa = &[3.0f32, 12.0, 23.0, NAN];
2802        let bb = &[3.0f32, 47.5, 1.5, NAN];
2803
2804        let ee = &[0i32, 1, 0, 0];
2805
2806        for i in 0..4 {
2807            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2808            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2809
2810            let r = _mm_ucomilt_ss(a, b);
2811
2812            assert_eq!(
2813                ee[i], r,
2814                "_mm_ucomilt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2815                a, b, r, ee[i], i
2816            );
2817        }
2818    }
2819
2820    #[simd_test(enable = "sse")]
2821    unsafe fn test_mm_ucomile_ss() {
2822        let aa = &[3.0f32, 12.0, 23.0, NAN];
2823        let bb = &[3.0f32, 47.5, 1.5, NAN];
2824
2825        let ee = &[1i32, 1, 0, 0];
2826
2827        for i in 0..4 {
2828            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2829            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2830
2831            let r = _mm_ucomile_ss(a, b);
2832
2833            assert_eq!(
2834                ee[i], r,
2835                "_mm_ucomile_ss({:?}, {:?}) = {}, expected: {} (i={})",
2836                a, b, r, ee[i], i
2837            );
2838        }
2839    }
2840
2841    #[simd_test(enable = "sse")]
2842    unsafe fn test_mm_ucomigt_ss() {
2843        let aa = &[3.0f32, 12.0, 23.0, NAN];
2844        let bb = &[3.0f32, 47.5, 1.5, NAN];
2845
2846        let ee = &[0i32, 0, 1, 0];
2847
2848        for i in 0..4 {
2849            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2850            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2851
2852            let r = _mm_ucomigt_ss(a, b);
2853
2854            assert_eq!(
2855                ee[i], r,
2856                "_mm_ucomigt_ss({:?}, {:?}) = {}, expected: {} (i={})",
2857                a, b, r, ee[i], i
2858            );
2859        }
2860    }
2861
2862    #[simd_test(enable = "sse")]
2863    unsafe fn test_mm_ucomige_ss() {
2864        let aa = &[3.0f32, 12.0, 23.0, NAN];
2865        let bb = &[3.0f32, 47.5, 1.5, NAN];
2866
2867        let ee = &[1i32, 0, 1, 0];
2868
2869        for i in 0..4 {
2870            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2871            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2872
2873            let r = _mm_ucomige_ss(a, b);
2874
2875            assert_eq!(
2876                ee[i], r,
2877                "_mm_ucomige_ss({:?}, {:?}) = {}, expected: {} (i={})",
2878                a, b, r, ee[i], i
2879            );
2880        }
2881    }
2882
2883    #[simd_test(enable = "sse")]
2884    unsafe fn test_mm_ucomineq_ss() {
2885        let aa = &[3.0f32, 12.0, 23.0, NAN];
2886        let bb = &[3.0f32, 47.5, 1.5, NAN];
2887
2888        let ee = &[0i32, 1, 1, 1];
2889
2890        for i in 0..4 {
2891            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
2892            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);
2893
2894            let r = _mm_ucomineq_ss(a, b);
2895
2896            assert_eq!(
2897                ee[i], r,
2898                "_mm_ucomineq_ss({:?}, {:?}) = {}, expected: {} (i={})",
2899                a, b, r, ee[i], i
2900            );
2901        }
2902    }
2903
    #[allow(deprecated)] // FIXME: This test uses deprecated CSR access functions
    #[simd_test(enable = "sse")]
    #[cfg_attr(miri, ignore)] // Uses _mm_setcsr, which is not supported by Miri
    unsafe fn test_mm_comieq_ss_vs_ucomieq_ss() {
        // Verifies the one observable difference between the COMISS and
        // UCOMISS families: with a quiet-NaN operand, `comieq_ss` should
        // signal an Invalid Operation Exception (visible in the MXCSR
        // exception-state flags), while `ucomieq_ss` should not.
        let aa = &[3.0f32, NAN, 23.0, NAN];
        let bb = &[3.0f32, 47.5, NAN, NAN];

        let ee = &[1i32, 0, 0, 0]; // expected 0/1 comparison results for both intrinsics
        let exc = &[0u32, 1, 1, 1]; // Should comieq_ss signal an exception?

        for i in 0..4 {
            let a = _mm_setr_ps(aa[i], 1.0, 2.0, 3.0);
            let b = _mm_setr_ps(bb[i], 0.0, 2.0, 4.0);

            // Clear the sticky exception flags immediately before each call so
            // the flags read afterwards are attributable to that call alone.
            _MM_SET_EXCEPTION_STATE(0);
            let r1 = _mm_comieq_ss(*black_box(&a), b);
            let s1 = _MM_GET_EXCEPTION_STATE();

            _MM_SET_EXCEPTION_STATE(0);
            let r2 = _mm_ucomieq_ss(*black_box(&a), b);
            let s2 = _MM_GET_EXCEPTION_STATE();

            assert_eq!(
                ee[i], r1,
                "_mm_comeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r1, ee[i], i
            );
            assert_eq!(
                ee[i], r2,
                "_mm_ucomeq_ss({:?}, {:?}) = {}, expected: {} (i={})",
                a, b, r2, ee[i], i
            );
            // exc[i] is 0 or 1, so this is either 0 or _MM_EXCEPT_INVALID.
            assert_eq!(
                s1,
                exc[i] * _MM_EXCEPT_INVALID,
                "_mm_comieq_ss() set exception flags: {} (i={})",
                s1,
                i
            );
            assert_eq!(
                s2,
                0, // ucomieq_ss should not signal an exception
                "_mm_ucomieq_ss() set exception flags: {} (i={})",
                s2,
                i
            );
        }
    }
2954
2955    #[simd_test(enable = "sse")]
2956    unsafe fn test_mm_cvtss_si32() {
2957        let inputs = &[42.0f32, -3.1, 4.0e10, 4.0e-20, NAN, 2147483500.1];
2958        let result = &[42i32, -3, i32::MIN, 0, i32::MIN, 2147483520];
2959        for i in 0..inputs.len() {
2960            let x = _mm_setr_ps(inputs[i], 1.0, 3.0, 4.0);
2961            let e = result[i];
2962            let r = _mm_cvtss_si32(x);
2963            assert_eq!(
2964                e, r,
2965                "TestCase #{} _mm_cvtss_si32({:?}) = {}, expected: {}",
2966                i, x, r, e
2967            );
2968        }
2969    }
2970
2971    #[simd_test(enable = "sse")]
2972    unsafe fn test_mm_cvttss_si32() {
2973        let inputs = &[
2974            (42.0f32, 42i32),
2975            (-31.4, -31),
2976            (-33.5, -33),
2977            (-34.5, -34),
2978            (10.999, 10),
2979            (-5.99, -5),
2980            (4.0e10, i32::MIN),
2981            (4.0e-10, 0),
2982            (NAN, i32::MIN),
2983            (2147483500.1, 2147483520),
2984        ];
2985        for (i, &(xi, e)) in inputs.iter().enumerate() {
2986            let x = _mm_setr_ps(xi, 1.0, 3.0, 4.0);
2987            let r = _mm_cvttss_si32(x);
2988            assert_eq!(
2989                e, r,
2990                "TestCase #{} _mm_cvttss_si32({:?}) = {}, expected: {}",
2991                i, x, r, e
2992            );
2993        }
2994    }
2995
2996    #[simd_test(enable = "sse")]
2997    unsafe fn test_mm_cvtsi32_ss() {
2998        let inputs = &[
2999            (4555i32, 4555.0f32),
3000            (322223333, 322223330.0),
3001            (-432, -432.0),
3002            (-322223333, -322223330.0),
3003        ];
3004
3005        for &(x, f) in inputs.iter() {
3006            let a = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3007            let r = _mm_cvtsi32_ss(a, x);
3008            let e = _mm_setr_ps(f, 6.0, 7.0, 8.0);
3009            assert_eq_m128(e, r);
3010        }
3011    }
3012
3013    #[simd_test(enable = "sse")]
3014    unsafe fn test_mm_cvtss_f32() {
3015        let a = _mm_setr_ps(312.0134, 5.0, 6.0, 7.0);
3016        assert_eq!(_mm_cvtss_f32(a), 312.0134);
3017    }
3018
3019    #[simd_test(enable = "sse")]
3020    unsafe fn test_mm_set_ss() {
3021        let r = _mm_set_ss(black_box(4.25));
3022        assert_eq_m128(r, _mm_setr_ps(4.25, 0.0, 0.0, 0.0));
3023    }
3024
3025    #[simd_test(enable = "sse")]
3026    unsafe fn test_mm_set1_ps() {
3027        let r1 = _mm_set1_ps(black_box(4.25));
3028        let r2 = _mm_set_ps1(black_box(4.25));
3029        assert_eq!(get_m128(r1, 0), 4.25);
3030        assert_eq!(get_m128(r1, 1), 4.25);
3031        assert_eq!(get_m128(r1, 2), 4.25);
3032        assert_eq!(get_m128(r1, 3), 4.25);
3033        assert_eq!(get_m128(r2, 0), 4.25);
3034        assert_eq!(get_m128(r2, 1), 4.25);
3035        assert_eq!(get_m128(r2, 2), 4.25);
3036        assert_eq!(get_m128(r2, 3), 4.25);
3037    }
3038
3039    #[simd_test(enable = "sse")]
3040    unsafe fn test_mm_set_ps() {
3041        let r = _mm_set_ps(
3042            black_box(1.0),
3043            black_box(2.0),
3044            black_box(3.0),
3045            black_box(4.0),
3046        );
3047        assert_eq!(get_m128(r, 0), 4.0);
3048        assert_eq!(get_m128(r, 1), 3.0);
3049        assert_eq!(get_m128(r, 2), 2.0);
3050        assert_eq!(get_m128(r, 3), 1.0);
3051    }
3052
3053    #[simd_test(enable = "sse")]
3054    unsafe fn test_mm_setr_ps() {
3055        let r = _mm_setr_ps(
3056            black_box(1.0),
3057            black_box(2.0),
3058            black_box(3.0),
3059            black_box(4.0),
3060        );
3061        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 3.0, 4.0));
3062    }
3063
3064    #[simd_test(enable = "sse")]
3065    unsafe fn test_mm_setzero_ps() {
3066        let r = *black_box(&_mm_setzero_ps());
3067        assert_eq_m128(r, _mm_set1_ps(0.0));
3068    }
3069
3070    #[simd_test(enable = "sse")]
3071    unsafe fn test_mm_shuffle() {
3072        assert_eq!(_MM_SHUFFLE(0, 1, 1, 3), 0b00_01_01_11);
3073        assert_eq!(_MM_SHUFFLE(3, 1, 1, 0), 0b11_01_01_00);
3074        assert_eq!(_MM_SHUFFLE(1, 2, 2, 1), 0b01_10_10_01);
3075    }
3076
3077    #[simd_test(enable = "sse")]
3078    unsafe fn test_mm_shuffle_ps() {
3079        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3080        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3081        let r = _mm_shuffle_ps::<0b00_01_01_11>(a, b);
3082        assert_eq_m128(r, _mm_setr_ps(4.0, 2.0, 6.0, 5.0));
3083    }
3084
3085    #[simd_test(enable = "sse")]
3086    unsafe fn test_mm_unpackhi_ps() {
3087        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3088        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3089        let r = _mm_unpackhi_ps(a, b);
3090        assert_eq_m128(r, _mm_setr_ps(3.0, 7.0, 4.0, 8.0));
3091    }
3092
3093    #[simd_test(enable = "sse")]
3094    unsafe fn test_mm_unpacklo_ps() {
3095        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3096        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3097        let r = _mm_unpacklo_ps(a, b);
3098        assert_eq_m128(r, _mm_setr_ps(1.0, 5.0, 2.0, 6.0));
3099    }
3100
3101    #[simd_test(enable = "sse")]
3102    unsafe fn test_mm_movehl_ps() {
3103        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3104        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3105        let r = _mm_movehl_ps(a, b);
3106        assert_eq_m128(r, _mm_setr_ps(7.0, 8.0, 3.0, 4.0));
3107    }
3108
3109    #[simd_test(enable = "sse")]
3110    unsafe fn test_mm_movelh_ps() {
3111        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3112        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3113        let r = _mm_movelh_ps(a, b);
3114        assert_eq_m128(r, _mm_setr_ps(1.0, 2.0, 5.0, 6.0));
3115    }
3116
3117    #[simd_test(enable = "sse")]
3118    unsafe fn test_mm_load_ss() {
3119        let a = 42.0f32;
3120        let r = _mm_load_ss(ptr::addr_of!(a));
3121        assert_eq_m128(r, _mm_setr_ps(42.0, 0.0, 0.0, 0.0));
3122    }
3123
3124    #[simd_test(enable = "sse")]
3125    unsafe fn test_mm_load1_ps() {
3126        let a = 42.0f32;
3127        let r = _mm_load1_ps(ptr::addr_of!(a));
3128        assert_eq_m128(r, _mm_setr_ps(42.0, 42.0, 42.0, 42.0));
3129    }
3130
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_load_ps() {
        // `_mm_load_ps` requires a 16-byte-aligned pointer, but an f32 array
        // is only guaranteed 4-byte alignment, so the pointer may need to be
        // advanced (by whole elements) to the next 16-byte boundary.
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Number of f32 elements (4 bytes each) to skip to reach the
            // next 16-byte boundary.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_load_ps(p);
        // Skipping `delta` elements shifts every loaded lane up by `delta`,
        // since the array values increase by 1.0 per element.
        let e = _mm_add_ps(_mm_setr_ps(1.0, 2.0, 3.0, 4.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3152
3153    #[simd_test(enable = "sse")]
3154    unsafe fn test_mm_loadu_ps() {
3155        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];
3156        let p = vals.as_ptr().add(3);
3157        let r = _mm_loadu_ps(black_box(p));
3158        assert_eq_m128(r, _mm_setr_ps(4.0, 5.0, 6.0, 7.0));
3159    }
3160
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_loadr_ps() {
        // `_mm_loadr_ps` (reversed load) needs a 16-byte-aligned pointer;
        // the array itself is only 4-byte aligned, so align manually.
        let vals = &[1.0f32, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0];

        let mut p = vals.as_ptr();
        let mut fixup = 0.0f32;

        // Make sure p is aligned, otherwise we might get a
        // (signal: 11, SIGSEGV: invalid memory reference)

        let unalignment = (p as usize) & 0xf;
        if unalignment != 0 {
            // Number of f32 elements (4 bytes each) to skip to reach the
            // next 16-byte boundary; `fixup` compensates the expected values.
            let delta = (16 - unalignment) >> 2;
            fixup = delta as f32;
            p = p.add(delta);
        }

        let r = _mm_loadr_ps(p);
        // Lanes come back in reversed order; each is shifted up by the number
        // of skipped elements since values increase by 1.0 per element.
        let e = _mm_add_ps(_mm_setr_ps(4.0, 3.0, 2.0, 1.0), _mm_set1_ps(fixup));
        assert_eq_m128(r, e);
    }
3182
3183    #[simd_test(enable = "sse")]
3184    unsafe fn test_mm_store_ss() {
3185        let mut vals = [0.0f32; 8];
3186        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3187        _mm_store_ss(vals.as_mut_ptr().add(1), a);
3188
3189        assert_eq!(vals[0], 0.0);
3190        assert_eq!(vals[1], 1.0);
3191        assert_eq!(vals[2], 0.0);
3192    }
3193
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store1_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // `_mm_store1_ps` needs a 16-byte-aligned destination; advance `p`
        // by whole f32 elements (4 bytes each) until it is aligned.
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store1_ps(p, *black_box(&a));

        // Element just before the destination must be untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        // Lane 0 of `a` is broadcast into all four stored elements.
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 1.0);
        assert_eq!(vals[ofs + 2], 1.0);
        assert_eq!(vals[ofs + 3], 1.0);
        // Element just past the destination must be untouched.
        assert_eq!(vals[ofs + 4], 0.0);
    }
3218
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_store_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary (required by `_mm_store_ps`); `ofs` is
        // the number of f32 elements skipped.
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_store_ps(p, *black_box(&a));

        // Elements on either side of the 16-byte destination stay untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3244
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storer_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Align p to 16-byte boundary (required by `_mm_storer_ps`); `ofs`
        // is the number of f32 elements skipped.
        if (p as usize) & 0xf != 0 {
            ofs = (16 - ((p as usize) & 0xf)) >> 2;
            p = p.add(ofs);
        }

        _mm_storer_ps(p, *black_box(&a));

        // Elements on either side of the destination stay untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        // Lanes are stored in reversed order: highest lane first.
        assert_eq!(vals[ofs + 0], 4.0);
        assert_eq!(vals[ofs + 1], 3.0);
        assert_eq!(vals[ofs + 2], 2.0);
        assert_eq!(vals[ofs + 3], 1.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3270
    #[simd_test(enable = "sse")]
    unsafe fn test_mm_storeu_ps() {
        let mut vals = [0.0f32; 8];
        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);

        let mut ofs = 0;
        let mut p = vals.as_mut_ptr();

        // Make sure p is **not** aligned to 16-byte boundary — the point of
        // this test is that the unaligned store still works.
        if (p as usize) & 0xf == 0 {
            ofs = 1;
            p = p.add(1);
        }

        _mm_storeu_ps(p, *black_box(&a));

        // Elements on either side of the destination stay untouched.
        if ofs > 0 {
            assert_eq!(vals[ofs - 1], 0.0);
        }
        assert_eq!(vals[ofs + 0], 1.0);
        assert_eq!(vals[ofs + 1], 2.0);
        assert_eq!(vals[ofs + 2], 3.0);
        assert_eq!(vals[ofs + 3], 4.0);
        assert_eq!(vals[ofs + 4], 0.0);
    }
3296
3297    #[simd_test(enable = "sse")]
3298    unsafe fn test_mm_move_ss() {
3299        let a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3300        let b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3301
3302        let r = _mm_move_ss(a, b);
3303        let e = _mm_setr_ps(5.0, 2.0, 3.0, 4.0);
3304        assert_eq_m128(e, r);
3305    }
3306
3307    #[simd_test(enable = "sse")]
3308    unsafe fn test_mm_movemask_ps() {
3309        let r = _mm_movemask_ps(_mm_setr_ps(-1.0, 5.0, -5.0, 0.0));
3310        assert_eq!(r, 0b0101);
3311
3312        let r = _mm_movemask_ps(_mm_setr_ps(-1.0, -5.0, -5.0, 0.0));
3313        assert_eq!(r, 0b0111);
3314    }
3315
    #[simd_test(enable = "sse")]
    // Miri cannot support this until it is clear how it fits in the Rust memory model
    #[cfg_attr(miri, ignore)]
    unsafe fn test_mm_sfence() {
        // Smoke test: the store fence must execute without faulting; there is
        // no observable result to assert on.
        _mm_sfence();
    }
3322
    #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
    #[simd_test(enable = "sse")]
    #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
    unsafe fn test_mm_getcsr_setcsr_1() {
        // Save the MXCSR control/status register so it can be restored after
        // the test changes the flush-to-zero mode.
        let saved_csr = _mm_getcsr();

        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);

        // With flush-to-zero ON, the subnormal product in lane 0 is flushed
        // to 0.0 instead of being kept as a denormal.
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON);
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        _mm_setcsr(saved_csr);

        let exp = _mm_setr_ps(0.0, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp); // first component is a denormalized f32
    }
3340
    #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
    #[simd_test(enable = "sse")]
    #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
    unsafe fn test_mm_getcsr_setcsr_2() {
        // Same as _mm_setcsr_1 test, but with opposite flag value.

        // Save the MXCSR register so it can be restored afterwards.
        let saved_csr = _mm_getcsr();

        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(0.001, 0.0, 0.0, 1.0);

        // With flush-to-zero OFF, the subnormal product in lane 0 is kept
        // as a denormal value rather than being flushed to 0.0.
        _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_OFF);
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        _mm_setcsr(saved_csr);

        let exp = _mm_setr_ps(1.1e-39, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp); // first component is a denormalized f32
    }
3360
    #[allow(deprecated)] // FIXME: This tests functions that are immediate UB
    #[simd_test(enable = "sse")]
    #[cfg_attr(miri, ignore)] // Miri does not support accessing the CSR
    unsafe fn test_mm_getcsr_setcsr_underflow() {
        // Clear all pending exception flags so a later underflow is
        // unambiguously caused by this test's multiplication.
        _MM_SET_EXCEPTION_STATE(0);

        let a = _mm_setr_ps(1.1e-36, 0.0, 0.0, 1.0);
        let b = _mm_setr_ps(1e-5, 0.0, 0.0, 1.0);

        assert_eq!(_MM_GET_EXCEPTION_STATE(), 0); // just to be sure

        // Lane 0 produces a subnormal result, which should raise the
        // underflow exception flag in MXCSR.
        let r = _mm_mul_ps(*black_box(&a), *black_box(&b));

        let exp = _mm_setr_ps(1.1e-41, 0.0, 0.0, 1.0);
        assert_eq_m128(r, exp);

        let underflow = _MM_GET_EXCEPTION_STATE() & _MM_EXCEPT_UNDERFLOW != 0;
        assert!(underflow);
    }
3380
3381    #[simd_test(enable = "sse")]
3382    unsafe fn test_MM_TRANSPOSE4_PS() {
3383        let mut a = _mm_setr_ps(1.0, 2.0, 3.0, 4.0);
3384        let mut b = _mm_setr_ps(5.0, 6.0, 7.0, 8.0);
3385        let mut c = _mm_setr_ps(9.0, 10.0, 11.0, 12.0);
3386        let mut d = _mm_setr_ps(13.0, 14.0, 15.0, 16.0);
3387
3388        _MM_TRANSPOSE4_PS(&mut a, &mut b, &mut c, &mut d);
3389
3390        assert_eq_m128(a, _mm_setr_ps(1.0, 5.0, 9.0, 13.0));
3391        assert_eq_m128(b, _mm_setr_ps(2.0, 6.0, 10.0, 14.0));
3392        assert_eq_m128(c, _mm_setr_ps(3.0, 7.0, 11.0, 15.0));
3393        assert_eq_m128(d, _mm_setr_ps(4.0, 8.0, 12.0, 16.0));
3394    }
3395
    /// 16-byte-aligned backing storage; the alignment guarantees a valid
    /// destination for the aligned non-temporal store in `test_mm_stream_ps`.
    #[repr(align(16))]
    struct Memory {
        pub data: [f32; 4],
    }
3400
3401    #[simd_test(enable = "sse")]
3402    // Miri cannot support this until it is clear how it fits in the Rust memory model
3403    // (non-temporal store)
3404    #[cfg_attr(miri, ignore)]
3405    unsafe fn test_mm_stream_ps() {
3406        let a = _mm_set1_ps(7.0);
3407        let mut mem = Memory { data: [-1.0; 4] };
3408
3409        _mm_stream_ps(ptr::addr_of_mut!(mem.data[0]), a);
3410        for i in 0..4 {
3411            assert_eq!(mem.data[i], get_m128(a, i));
3412        }
3413    }
3414}