core/stdarch/crates/core_arch/src/x86/avx2.rs

//! Advanced Vector Extensions 2 (AVX2)
2//!
3//! AVX2 expands most AVX commands to 256-bit wide vector registers and
4//! adds [FMA](https://en.wikipedia.org/wiki/Fused_multiply-accumulate).
5//!
6//! The references are:
7//!
8//! - [Intel 64 and IA-32 Architectures Software Developer's Manual Volume 2:
9//!   Instruction Set Reference, A-Z][intel64_ref].
10//! - [AMD64 Architecture Programmer's Manual, Volume 3: General-Purpose and
11//!   System Instructions][amd64_ref].
12//!
13//! Wikipedia's [AVX][wiki_avx] and [FMA][wiki_fma] pages provide a quick
14//! overview of the instructions available.
15//!
16//! [intel64_ref]: http://www.intel.de/content/dam/www/public/us/en/documents/manuals/64-ia-32-architectures-software-developer-instruction-set-reference-manual-325383.pdf
17//! [amd64_ref]: http://support.amd.com/TechDocs/24594.pdf
18//! [wiki_avx]: https://en.wikipedia.org/wiki/Advanced_Vector_Extensions
19//! [wiki_fma]: https://en.wikipedia.org/wiki/Fused_multiply-accumulate
20
21use crate::core_arch::{simd::*, x86::*};
22use crate::intrinsics::simd::*;
23
24#[cfg(test)]
25use stdarch_test::assert_instr;
26
27/// Computes the absolute values of packed 32-bit integers in `a`.
28///
29/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi32)
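///
/// # Examples
///
/// An illustrative usage sketch (not from the upstream documentation); it
/// assumes the caller verifies AVX2 support at runtime and uses other
/// `std::arch` intrinsics only for setup and inspection:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let a = _mm256_setr_epi32(-1, 2, -3, 4, -5, 6, -7, 8);
///             let r = _mm256_abs_epi32(a);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             // Note: `i32::MIN` has no positive counterpart and is returned unchanged.
///             assert_eq!(out, [1, 2, 3, 4, 5, 6, 7, 8]);
///         }
///     }
/// }
/// ```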
30#[inline]
31#[target_feature(enable = "avx2")]
32#[cfg_attr(test, assert_instr(vpabsd))]
33#[stable(feature = "simd_x86", since = "1.27.0")]
34pub fn _mm256_abs_epi32(a: __m256i) -> __m256i {
35    unsafe {
36        let a = a.as_i32x8();
37        let r = simd_select::<m32x8, _>(simd_lt(a, i32x8::ZERO), simd_neg(a), a);
38        transmute(r)
39    }
40}
41
42/// Computes the absolute values of packed 16-bit integers in `a`.
43///
44/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi16)
45#[inline]
46#[target_feature(enable = "avx2")]
47#[cfg_attr(test, assert_instr(vpabsw))]
48#[stable(feature = "simd_x86", since = "1.27.0")]
49pub fn _mm256_abs_epi16(a: __m256i) -> __m256i {
50    unsafe {
51        let a = a.as_i16x16();
52        let r = simd_select::<m16x16, _>(simd_lt(a, i16x16::ZERO), simd_neg(a), a);
53        transmute(r)
54    }
55}
56
57/// Computes the absolute values of packed 8-bit integers in `a`.
58///
59/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_abs_epi8)
60#[inline]
61#[target_feature(enable = "avx2")]
62#[cfg_attr(test, assert_instr(vpabsb))]
63#[stable(feature = "simd_x86", since = "1.27.0")]
64pub fn _mm256_abs_epi8(a: __m256i) -> __m256i {
65    unsafe {
66        let a = a.as_i8x32();
67        let r = simd_select::<m8x32, _>(simd_lt(a, i8x32::ZERO), simd_neg(a), a);
68        transmute(r)
69    }
70}
71
72/// Adds packed 64-bit integers in `a` and `b`.
73///
74/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi64)
75#[inline]
76#[target_feature(enable = "avx2")]
77#[cfg_attr(test, assert_instr(vpaddq))]
78#[stable(feature = "simd_x86", since = "1.27.0")]
79pub fn _mm256_add_epi64(a: __m256i, b: __m256i) -> __m256i {
80    unsafe { transmute(simd_add(a.as_i64x4(), b.as_i64x4())) }
81}
82
83/// Adds packed 32-bit integers in `a` and `b`.
84///
85/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi32)
86#[inline]
87#[target_feature(enable = "avx2")]
88#[cfg_attr(test, assert_instr(vpaddd))]
89#[stable(feature = "simd_x86", since = "1.27.0")]
90pub fn _mm256_add_epi32(a: __m256i, b: __m256i) -> __m256i {
91    unsafe { transmute(simd_add(a.as_i32x8(), b.as_i32x8())) }
92}
93
94/// Adds packed 16-bit integers in `a` and `b`.
95///
96/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi16)
97#[inline]
98#[target_feature(enable = "avx2")]
99#[cfg_attr(test, assert_instr(vpaddw))]
100#[stable(feature = "simd_x86", since = "1.27.0")]
101pub fn _mm256_add_epi16(a: __m256i, b: __m256i) -> __m256i {
102    unsafe { transmute(simd_add(a.as_i16x16(), b.as_i16x16())) }
103}
104
105/// Adds packed 8-bit integers in `a` and `b`.
106///
107/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_add_epi8)
108#[inline]
109#[target_feature(enable = "avx2")]
110#[cfg_attr(test, assert_instr(vpaddb))]
111#[stable(feature = "simd_x86", since = "1.27.0")]
112pub fn _mm256_add_epi8(a: __m256i, b: __m256i) -> __m256i {
113    unsafe { transmute(simd_add(a.as_i8x32(), b.as_i8x32())) }
114}
115
116/// Adds packed 8-bit integers in `a` and `b` using saturation.
117///
118/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi8)
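///
/// # Examples
///
/// A hedged sketch (not from the upstream docs) showing the saturating
/// behaviour, guarded by runtime feature detection:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             // 100 + 100 exceeds `i8::MAX`, so every lane saturates to 127
///             // instead of wrapping around.
///             let r = _mm256_adds_epi8(_mm256_set1_epi8(100), _mm256_set1_epi8(100));
///             let mut out = [0i8; 32];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             assert!(out.iter().all(|&x| x == 127));
///         }
///     }
/// }
/// ```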
119#[inline]
120#[target_feature(enable = "avx2")]
121#[cfg_attr(test, assert_instr(vpaddsb))]
122#[stable(feature = "simd_x86", since = "1.27.0")]
123pub fn _mm256_adds_epi8(a: __m256i, b: __m256i) -> __m256i {
124    unsafe { transmute(simd_saturating_add(a.as_i8x32(), b.as_i8x32())) }
125}
126
127/// Adds packed 16-bit integers in `a` and `b` using saturation.
128///
129/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epi16)
130#[inline]
131#[target_feature(enable = "avx2")]
132#[cfg_attr(test, assert_instr(vpaddsw))]
133#[stable(feature = "simd_x86", since = "1.27.0")]
134pub fn _mm256_adds_epi16(a: __m256i, b: __m256i) -> __m256i {
135    unsafe { transmute(simd_saturating_add(a.as_i16x16(), b.as_i16x16())) }
136}
137
138/// Adds packed unsigned 8-bit integers in `a` and `b` using saturation.
139///
140/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu8)
141#[inline]
142#[target_feature(enable = "avx2")]
143#[cfg_attr(test, assert_instr(vpaddusb))]
144#[stable(feature = "simd_x86", since = "1.27.0")]
145pub fn _mm256_adds_epu8(a: __m256i, b: __m256i) -> __m256i {
146    unsafe { transmute(simd_saturating_add(a.as_u8x32(), b.as_u8x32())) }
147}
148
149/// Adds packed unsigned 16-bit integers in `a` and `b` using saturation.
150///
151/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_adds_epu16)
152#[inline]
153#[target_feature(enable = "avx2")]
154#[cfg_attr(test, assert_instr(vpaddusw))]
155#[stable(feature = "simd_x86", since = "1.27.0")]
156pub fn _mm256_adds_epu16(a: __m256i, b: __m256i) -> __m256i {
157    unsafe { transmute(simd_saturating_add(a.as_u16x16(), b.as_u16x16())) }
158}
159
/// Concatenates pairs of 16-byte blocks in `a` and `b` into a 32-byte temporary
/// result, shifts the result right by `IMM8` bytes, and returns the low 16 bytes.
162///
163/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_alignr_epi8)
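///
/// # Examples
///
/// An illustrative sketch (not from the upstream docs). Note that the
/// concatenation and shift happen separately within each 128-bit lane;
/// the byte values below are chosen only to make the lanes easy to follow:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let bytes_b: [i8; 32] = std::array::from_fn(|i| i as i8); // 0..=31
///             let bytes_a: [i8; 32] = std::array::from_fn(|i| 32 + i as i8); // 32..=63
///             let a = _mm256_loadu_si256(bytes_a.as_ptr().cast());
///             let b = _mm256_loadu_si256(bytes_b.as_ptr().cast());
///             // Shift each (a, b) lane pair right by 4 bytes: the low bytes of
///             // each result lane come from `b`, the top 4 bytes from `a`.
///             let r = _mm256_alignr_epi8::<4>(a, b);
///             let mut out = [0i8; 32];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             assert_eq!(out[0], 4); // b[4] in the low lane
///             assert_eq!(out[12], 32); // a[0] in the low lane
///             assert_eq!(out[16], 20); // b[20] starts the high lane
///         }
///     }
/// }
/// ```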
164#[inline]
165#[target_feature(enable = "avx2")]
166#[cfg_attr(test, assert_instr(vpalignr, IMM8 = 7))]
167#[rustc_legacy_const_generics(2)]
168#[stable(feature = "simd_x86", since = "1.27.0")]
169pub fn _mm256_alignr_epi8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
170    static_assert_uimm_bits!(IMM8, 8);
171
172    // If palignr is shifting the pair of vectors more than the size of two
173    // lanes, emit zero.
174    if IMM8 >= 32 {
175        return _mm256_setzero_si256();
176    }
177    // If palignr is shifting the pair of input vectors more than one lane,
178    // but less than two lanes, convert to shifting in zeroes.
179    let (a, b) = if IMM8 > 16 {
180        (_mm256_setzero_si256(), a)
181    } else {
182        (a, b)
183    };
184    unsafe {
185        if IMM8 == 16 {
186            return transmute(a);
187        }
188    }
189    const fn mask(shift: u32, i: u32) -> u32 {
190        let shift = shift % 16;
191        let mod_i = i % 16;
192        if mod_i < (16 - shift) {
193            i + shift
194        } else {
195            i + 16 + shift
196        }
197    }
198
199    unsafe {
200        let r: i8x32 = simd_shuffle!(
201            b.as_i8x32(),
202            a.as_i8x32(),
203            [
204                mask(IMM8 as u32, 0),
205                mask(IMM8 as u32, 1),
206                mask(IMM8 as u32, 2),
207                mask(IMM8 as u32, 3),
208                mask(IMM8 as u32, 4),
209                mask(IMM8 as u32, 5),
210                mask(IMM8 as u32, 6),
211                mask(IMM8 as u32, 7),
212                mask(IMM8 as u32, 8),
213                mask(IMM8 as u32, 9),
214                mask(IMM8 as u32, 10),
215                mask(IMM8 as u32, 11),
216                mask(IMM8 as u32, 12),
217                mask(IMM8 as u32, 13),
218                mask(IMM8 as u32, 14),
219                mask(IMM8 as u32, 15),
220                mask(IMM8 as u32, 16),
221                mask(IMM8 as u32, 17),
222                mask(IMM8 as u32, 18),
223                mask(IMM8 as u32, 19),
224                mask(IMM8 as u32, 20),
225                mask(IMM8 as u32, 21),
226                mask(IMM8 as u32, 22),
227                mask(IMM8 as u32, 23),
228                mask(IMM8 as u32, 24),
229                mask(IMM8 as u32, 25),
230                mask(IMM8 as u32, 26),
231                mask(IMM8 as u32, 27),
232                mask(IMM8 as u32, 28),
233                mask(IMM8 as u32, 29),
234                mask(IMM8 as u32, 30),
235                mask(IMM8 as u32, 31),
236            ],
237        );
238        transmute(r)
239    }
240}
241
242/// Computes the bitwise AND of 256 bits (representing integer data)
243/// in `a` and `b`.
244///
245/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_and_si256)
246#[inline]
247#[target_feature(enable = "avx2")]
248#[cfg_attr(test, assert_instr(vandps))]
249#[stable(feature = "simd_x86", since = "1.27.0")]
250pub fn _mm256_and_si256(a: __m256i, b: __m256i) -> __m256i {
251    unsafe { transmute(simd_and(a.as_i64x4(), b.as_i64x4())) }
252}
253
254/// Computes the bitwise NOT of 256 bits (representing integer data)
255/// in `a` and then AND with `b`.
256///
257/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_andnot_si256)
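///
/// # Examples
///
/// A minimal sketch (not from the upstream docs): the result keeps only the
/// bits of `b` that are clear in `a`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let a = _mm256_set1_epi32(0b1100);
///             let b = _mm256_set1_epi32(0b1010);
///             // (!a) & b == 0b0010 in every 32-bit element.
///             let r = _mm256_andnot_si256(a, b);
///             assert_eq!(_mm256_extract_epi32::<0>(r), 0b0010);
///         }
///     }
/// }
/// ```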
258#[inline]
259#[target_feature(enable = "avx2")]
260#[cfg_attr(test, assert_instr(vandnps))]
261#[stable(feature = "simd_x86", since = "1.27.0")]
262pub fn _mm256_andnot_si256(a: __m256i, b: __m256i) -> __m256i {
263    unsafe {
264        let all_ones = _mm256_set1_epi8(-1);
265        transmute(simd_and(
266            simd_xor(a.as_i64x4(), all_ones.as_i64x4()),
267            b.as_i64x4(),
268        ))
269    }
270}
271
272/// Averages packed unsigned 16-bit integers in `a` and `b`.
273///
274/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu16)
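///
/// # Examples
///
/// An illustrative sketch (not from the upstream docs) showing that the
/// average rounds up:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             // Each element is (3 + 4 + 1) >> 1 == 4.
///             let r = _mm256_avg_epu16(_mm256_set1_epi16(3), _mm256_set1_epi16(4));
///             let mut out = [0u16; 16];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             assert!(out.iter().all(|&x| x == 4));
///         }
///     }
/// }
/// ```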
275#[inline]
276#[target_feature(enable = "avx2")]
277#[cfg_attr(test, assert_instr(vpavgw))]
278#[stable(feature = "simd_x86", since = "1.27.0")]
279pub fn _mm256_avg_epu16(a: __m256i, b: __m256i) -> __m256i {
280    unsafe {
281        let a = simd_cast::<_, u32x16>(a.as_u16x16());
282        let b = simd_cast::<_, u32x16>(b.as_u16x16());
283        let r = simd_shr(simd_add(simd_add(a, b), u32x16::splat(1)), u32x16::splat(1));
284        transmute(simd_cast::<_, u16x16>(r))
285    }
286}
287
288/// Averages packed unsigned 8-bit integers in `a` and `b`.
289///
290/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_avg_epu8)
291#[inline]
292#[target_feature(enable = "avx2")]
293#[cfg_attr(test, assert_instr(vpavgb))]
294#[stable(feature = "simd_x86", since = "1.27.0")]
295pub fn _mm256_avg_epu8(a: __m256i, b: __m256i) -> __m256i {
296    unsafe {
297        let a = simd_cast::<_, u16x32>(a.as_u8x32());
298        let b = simd_cast::<_, u16x32>(b.as_u8x32());
299        let r = simd_shr(simd_add(simd_add(a, b), u16x32::splat(1)), u16x32::splat(1));
300        transmute(simd_cast::<_, u8x32>(r))
301    }
302}
303
304/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM4`.
305///
306/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi32)
307#[inline]
308#[target_feature(enable = "avx2")]
309#[cfg_attr(test, assert_instr(vblendps, IMM4 = 9))]
310#[rustc_legacy_const_generics(2)]
311#[stable(feature = "simd_x86", since = "1.27.0")]
312pub fn _mm_blend_epi32<const IMM4: i32>(a: __m128i, b: __m128i) -> __m128i {
313    static_assert_uimm_bits!(IMM4, 4);
314    unsafe {
315        let a = a.as_i32x4();
316        let b = b.as_i32x4();
317        let r: i32x4 = simd_shuffle!(
318            a,
319            b,
320            [
321                [0, 4, 0, 4][IMM4 as usize & 0b11],
322                [1, 1, 5, 5][IMM4 as usize & 0b11],
323                [2, 6, 2, 6][(IMM4 as usize >> 2) & 0b11],
324                [3, 3, 7, 7][(IMM4 as usize >> 2) & 0b11],
325            ],
326        );
327        transmute(r)
328    }
329}
330
331/// Blends packed 32-bit integers from `a` and `b` using control mask `IMM8`.
332///
333/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi32)
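///
/// # Examples
///
/// A hedged sketch (not from the upstream docs): bit `i` of `IMM8` selects
/// element `i` from `b` when set and from `a` when clear:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let a = _mm256_set1_epi32(1);
///             let b = _mm256_set1_epi32(2);
///             // The low four mask bits are set, so the low four elements come from `b`.
///             let r = _mm256_blend_epi32::<0b0000_1111>(a, b);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             assert_eq!(out, [2, 2, 2, 2, 1, 1, 1, 1]);
///         }
///     }
/// }
/// ```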
334#[inline]
335#[target_feature(enable = "avx2")]
336#[cfg_attr(test, assert_instr(vblendps, IMM8 = 9))]
337#[rustc_legacy_const_generics(2)]
338#[stable(feature = "simd_x86", since = "1.27.0")]
339pub fn _mm256_blend_epi32<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
340    static_assert_uimm_bits!(IMM8, 8);
341    unsafe {
342        let a = a.as_i32x8();
343        let b = b.as_i32x8();
344        let r: i32x8 = simd_shuffle!(
345            a,
346            b,
347            [
348                [0, 8, 0, 8][IMM8 as usize & 0b11],
349                [1, 1, 9, 9][IMM8 as usize & 0b11],
350                [2, 10, 2, 10][(IMM8 as usize >> 2) & 0b11],
351                [3, 3, 11, 11][(IMM8 as usize >> 2) & 0b11],
352                [4, 12, 4, 12][(IMM8 as usize >> 4) & 0b11],
353                [5, 5, 13, 13][(IMM8 as usize >> 4) & 0b11],
354                [6, 14, 6, 14][(IMM8 as usize >> 6) & 0b11],
355                [7, 7, 15, 15][(IMM8 as usize >> 6) & 0b11],
356            ],
357        );
358        transmute(r)
359    }
360}
361
362/// Blends packed 16-bit integers from `a` and `b` using control mask `IMM8`.
363///
364/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blend_epi16)
365#[inline]
366#[target_feature(enable = "avx2")]
367#[cfg_attr(test, assert_instr(vpblendw, IMM8 = 9))]
368#[rustc_legacy_const_generics(2)]
369#[stable(feature = "simd_x86", since = "1.27.0")]
370pub fn _mm256_blend_epi16<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
371    static_assert_uimm_bits!(IMM8, 8);
372    unsafe {
373        let a = a.as_i16x16();
374        let b = b.as_i16x16();
375
376        let r: i16x16 = simd_shuffle!(
377            a,
378            b,
379            [
380                [0, 16, 0, 16][IMM8 as usize & 0b11],
381                [1, 1, 17, 17][IMM8 as usize & 0b11],
382                [2, 18, 2, 18][(IMM8 as usize >> 2) & 0b11],
383                [3, 3, 19, 19][(IMM8 as usize >> 2) & 0b11],
384                [4, 20, 4, 20][(IMM8 as usize >> 4) & 0b11],
385                [5, 5, 21, 21][(IMM8 as usize >> 4) & 0b11],
386                [6, 22, 6, 22][(IMM8 as usize >> 6) & 0b11],
387                [7, 7, 23, 23][(IMM8 as usize >> 6) & 0b11],
388                [8, 24, 8, 24][IMM8 as usize & 0b11],
389                [9, 9, 25, 25][IMM8 as usize & 0b11],
390                [10, 26, 10, 26][(IMM8 as usize >> 2) & 0b11],
391                [11, 11, 27, 27][(IMM8 as usize >> 2) & 0b11],
392                [12, 28, 12, 28][(IMM8 as usize >> 4) & 0b11],
393                [13, 13, 29, 29][(IMM8 as usize >> 4) & 0b11],
394                [14, 30, 14, 30][(IMM8 as usize >> 6) & 0b11],
395                [15, 15, 31, 31][(IMM8 as usize >> 6) & 0b11],
396            ],
397        );
398        transmute(r)
399    }
400}
401
402/// Blends packed 8-bit integers from `a` and `b` using `mask`.
403///
404/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_blendv_epi8)
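///
/// # Examples
///
/// An illustrative sketch (not from the upstream docs): combining a comparison
/// with `_mm256_blendv_epi8` to keep only the negative elements of a vector:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let x = _mm256_setr_epi32(5, -3, 7, -9, 2, -1, 0, -4);
///             // All-ones where `x` is negative, all-zeros elsewhere.
///             let mask = _mm256_cmpgt_epi32(_mm256_setzero_si256(), x);
///             // Elements whose mask bytes have the high bit set take `x`,
///             // the others take the zero vector.
///             let r = _mm256_blendv_epi8(_mm256_setzero_si256(), x, mask);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             assert_eq!(out, [0, -3, 0, -9, 0, -1, 0, -4]);
///         }
///     }
/// }
/// ```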
405#[inline]
406#[target_feature(enable = "avx2")]
407#[cfg_attr(test, assert_instr(vpblendvb))]
408#[stable(feature = "simd_x86", since = "1.27.0")]
409pub fn _mm256_blendv_epi8(a: __m256i, b: __m256i, mask: __m256i) -> __m256i {
410    unsafe {
411        let mask: i8x32 = simd_lt(mask.as_i8x32(), i8x32::ZERO);
412        transmute(simd_select(mask, b.as_i8x32(), a.as_i8x32()))
413    }
414}
415
416/// Broadcasts the low packed 8-bit integer from `a` to all elements of
417/// the 128-bit returned value.
418///
419/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastb_epi8)
420#[inline]
421#[target_feature(enable = "avx2")]
422#[cfg_attr(test, assert_instr(vpbroadcastb))]
423#[stable(feature = "simd_x86", since = "1.27.0")]
424pub fn _mm_broadcastb_epi8(a: __m128i) -> __m128i {
425    unsafe {
426        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 16]);
427        transmute::<i8x16, _>(ret)
428    }
429}
430
431/// Broadcasts the low packed 8-bit integer from `a` to all elements of
432/// the 256-bit returned value.
433///
434/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastb_epi8)
435#[inline]
436#[target_feature(enable = "avx2")]
437#[cfg_attr(test, assert_instr(vpbroadcastb))]
438#[stable(feature = "simd_x86", since = "1.27.0")]
439pub fn _mm256_broadcastb_epi8(a: __m128i) -> __m256i {
440    unsafe {
441        let ret = simd_shuffle!(a.as_i8x16(), i8x16::ZERO, [0_u32; 32]);
442        transmute::<i8x32, _>(ret)
443    }
444}
445
446// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
447// often compiled to `vbroadcastss`.
448/// Broadcasts the low packed 32-bit integer from `a` to all elements of
449/// the 128-bit returned value.
450///
451/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastd_epi32)
452#[inline]
453#[target_feature(enable = "avx2")]
454#[cfg_attr(test, assert_instr(vbroadcastss))]
455#[stable(feature = "simd_x86", since = "1.27.0")]
456pub fn _mm_broadcastd_epi32(a: __m128i) -> __m128i {
457    unsafe {
458        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 4]);
459        transmute::<i32x4, _>(ret)
460    }
461}
462
// N.B., `simd_shuffle4` with integer data types for `a` and `b` is
464// often compiled to `vbroadcastss`.
465/// Broadcasts the low packed 32-bit integer from `a` to all elements of
466/// the 256-bit returned value.
467///
468/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastd_epi32)
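///
/// # Examples
///
/// A minimal sketch (not from the upstream docs); only the lowest element of
/// the source vector matters:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let a = _mm_setr_epi32(7, 8, 9, 10);
///             let r = _mm256_broadcastd_epi32(a);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             // Every element is a copy of the lowest 32-bit element, 7.
///             assert_eq!(out, [7; 8]);
///         }
///     }
/// }
/// ```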
469#[inline]
470#[target_feature(enable = "avx2")]
471#[cfg_attr(test, assert_instr(vbroadcastss))]
472#[stable(feature = "simd_x86", since = "1.27.0")]
473pub fn _mm256_broadcastd_epi32(a: __m128i) -> __m256i {
474    unsafe {
475        let ret = simd_shuffle!(a.as_i32x4(), i32x4::ZERO, [0_u32; 8]);
476        transmute::<i32x8, _>(ret)
477    }
478}
479
480/// Broadcasts the low packed 64-bit integer from `a` to all elements of
481/// the 128-bit returned value.
482///
483/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastq_epi64)
484#[inline]
485#[target_feature(enable = "avx2")]
486// Emits `vmovddup` instead of `vpbroadcastq`
487// See https://github.com/rust-lang/stdarch/issues/791
488#[cfg_attr(test, assert_instr(vmovddup))]
489#[stable(feature = "simd_x86", since = "1.27.0")]
490pub fn _mm_broadcastq_epi64(a: __m128i) -> __m128i {
491    unsafe {
492        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 2]);
493        transmute::<i64x2, _>(ret)
494    }
495}
496
497/// Broadcasts the low packed 64-bit integer from `a` to all elements of
498/// the 256-bit returned value.
499///
500/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastq_epi64)
501#[inline]
502#[target_feature(enable = "avx2")]
503#[cfg_attr(test, assert_instr(vbroadcastsd))]
504#[stable(feature = "simd_x86", since = "1.27.0")]
505pub fn _mm256_broadcastq_epi64(a: __m128i) -> __m256i {
506    unsafe {
507        let ret = simd_shuffle!(a.as_i64x2(), a.as_i64x2(), [0_u32; 4]);
508        transmute::<i64x4, _>(ret)
509    }
510}
511
512/// Broadcasts the low double-precision (64-bit) floating-point element
513/// from `a` to all elements of the 128-bit returned value.
514///
515/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsd_pd)
516#[inline]
517#[target_feature(enable = "avx2")]
518#[cfg_attr(test, assert_instr(vmovddup))]
519#[stable(feature = "simd_x86", since = "1.27.0")]
520pub fn _mm_broadcastsd_pd(a: __m128d) -> __m128d {
521    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 2]) }
522}
523
524/// Broadcasts the low double-precision (64-bit) floating-point element
525/// from `a` to all elements of the 256-bit returned value.
526///
527/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsd_pd)
528#[inline]
529#[target_feature(enable = "avx2")]
530#[cfg_attr(test, assert_instr(vbroadcastsd))]
531#[stable(feature = "simd_x86", since = "1.27.0")]
532pub fn _mm256_broadcastsd_pd(a: __m128d) -> __m256d {
533    unsafe { simd_shuffle!(a, _mm_setzero_pd(), [0_u32; 4]) }
534}
535
/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
537/// the 256-bit returned value.
538///
539/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastsi128_si256)
540#[inline]
541#[target_feature(enable = "avx2")]
542#[stable(feature = "simd_x86_updates", since = "1.82.0")]
543pub fn _mm_broadcastsi128_si256(a: __m128i) -> __m256i {
544    unsafe {
545        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
546        transmute::<i64x4, _>(ret)
547    }
548}
549
550// N.B., `broadcastsi128_si256` is often compiled to `vinsertf128` or
551// `vbroadcastf128`.
/// Broadcasts 128 bits of integer data from `a` to all 128-bit lanes in
553/// the 256-bit returned value.
554///
555/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastsi128_si256)
556#[inline]
557#[target_feature(enable = "avx2")]
558#[stable(feature = "simd_x86", since = "1.27.0")]
559pub fn _mm256_broadcastsi128_si256(a: __m128i) -> __m256i {
560    unsafe {
561        let ret = simd_shuffle!(a.as_i64x2(), i64x2::ZERO, [0, 1, 0, 1]);
562        transmute::<i64x4, _>(ret)
563    }
564}
565
566/// Broadcasts the low single-precision (32-bit) floating-point element
567/// from `a` to all elements of the 128-bit returned value.
568///
569/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastss_ps)
570#[inline]
571#[target_feature(enable = "avx2")]
572#[cfg_attr(test, assert_instr(vbroadcastss))]
573#[stable(feature = "simd_x86", since = "1.27.0")]
574pub fn _mm_broadcastss_ps(a: __m128) -> __m128 {
575    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 4]) }
576}
577
578/// Broadcasts the low single-precision (32-bit) floating-point element
579/// from `a` to all elements of the 256-bit returned value.
580///
581/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastss_ps)
582#[inline]
583#[target_feature(enable = "avx2")]
584#[cfg_attr(test, assert_instr(vbroadcastss))]
585#[stable(feature = "simd_x86", since = "1.27.0")]
586pub fn _mm256_broadcastss_ps(a: __m128) -> __m256 {
587    unsafe { simd_shuffle!(a, _mm_setzero_ps(), [0_u32; 8]) }
588}
589
/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 128-bit returned value.
592///
593/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_broadcastw_epi16)
594#[inline]
595#[target_feature(enable = "avx2")]
596#[cfg_attr(test, assert_instr(vpbroadcastw))]
597#[stable(feature = "simd_x86", since = "1.27.0")]
598pub fn _mm_broadcastw_epi16(a: __m128i) -> __m128i {
599    unsafe {
600        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 8]);
601        transmute::<i16x8, _>(ret)
602    }
603}
604
/// Broadcasts the low packed 16-bit integer from `a` to all elements of
/// the 256-bit returned value.
607///
608/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_broadcastw_epi16)
609#[inline]
610#[target_feature(enable = "avx2")]
611#[cfg_attr(test, assert_instr(vpbroadcastw))]
612#[stable(feature = "simd_x86", since = "1.27.0")]
613pub fn _mm256_broadcastw_epi16(a: __m128i) -> __m256i {
614    unsafe {
615        let ret = simd_shuffle!(a.as_i16x8(), i16x8::ZERO, [0_u32; 16]);
616        transmute::<i16x16, _>(ret)
617    }
618}
619
620/// Compares packed 64-bit integers in `a` and `b` for equality.
621///
622/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi64)
623#[inline]
624#[target_feature(enable = "avx2")]
625#[cfg_attr(test, assert_instr(vpcmpeqq))]
626#[stable(feature = "simd_x86", since = "1.27.0")]
627pub fn _mm256_cmpeq_epi64(a: __m256i, b: __m256i) -> __m256i {
628    unsafe { transmute::<i64x4, _>(simd_eq(a.as_i64x4(), b.as_i64x4())) }
629}
630
631/// Compares packed 32-bit integers in `a` and `b` for equality.
632///
633/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi32)
634#[inline]
635#[target_feature(enable = "avx2")]
636#[cfg_attr(test, assert_instr(vpcmpeqd))]
637#[stable(feature = "simd_x86", since = "1.27.0")]
638pub fn _mm256_cmpeq_epi32(a: __m256i, b: __m256i) -> __m256i {
639    unsafe { transmute::<i32x8, _>(simd_eq(a.as_i32x8(), b.as_i32x8())) }
640}
641
642/// Compares packed 16-bit integers in `a` and `b` for equality.
643///
644/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi16)
645#[inline]
646#[target_feature(enable = "avx2")]
647#[cfg_attr(test, assert_instr(vpcmpeqw))]
648#[stable(feature = "simd_x86", since = "1.27.0")]
649pub fn _mm256_cmpeq_epi16(a: __m256i, b: __m256i) -> __m256i {
650    unsafe { transmute::<i16x16, _>(simd_eq(a.as_i16x16(), b.as_i16x16())) }
651}
652
653/// Compares packed 8-bit integers in `a` and `b` for equality.
654///
655/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpeq_epi8)
656#[inline]
657#[target_feature(enable = "avx2")]
658#[cfg_attr(test, assert_instr(vpcmpeqb))]
659#[stable(feature = "simd_x86", since = "1.27.0")]
660pub fn _mm256_cmpeq_epi8(a: __m256i, b: __m256i) -> __m256i {
661    unsafe { transmute::<i8x32, _>(simd_eq(a.as_i8x32(), b.as_i8x32())) }
662}
663
664/// Compares packed 64-bit integers in `a` and `b` for greater-than.
665///
666/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi64)
667#[inline]
668#[target_feature(enable = "avx2")]
669#[cfg_attr(test, assert_instr(vpcmpgtq))]
670#[stable(feature = "simd_x86", since = "1.27.0")]
671pub fn _mm256_cmpgt_epi64(a: __m256i, b: __m256i) -> __m256i {
672    unsafe { transmute::<i64x4, _>(simd_gt(a.as_i64x4(), b.as_i64x4())) }
673}
674
675/// Compares packed 32-bit integers in `a` and `b` for greater-than.
676///
677/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi32)
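///
/// # Examples
///
/// A hedged sketch (not from the upstream docs): the comparison produces an
/// all-ones element (`-1`) where it holds and zero where it does not:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let a = _mm256_setr_epi32(1, 5, 3, 9, 0, 0, 0, 0);
///             let b = _mm256_setr_epi32(2, 4, 3, 8, 0, 0, 0, 0);
///             let r = _mm256_cmpgt_epi32(a, b);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             assert_eq!(out, [0, -1, 0, -1, 0, 0, 0, 0]);
///         }
///     }
/// }
/// ```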
678#[inline]
679#[target_feature(enable = "avx2")]
680#[cfg_attr(test, assert_instr(vpcmpgtd))]
681#[stable(feature = "simd_x86", since = "1.27.0")]
682pub fn _mm256_cmpgt_epi32(a: __m256i, b: __m256i) -> __m256i {
683    unsafe { transmute::<i32x8, _>(simd_gt(a.as_i32x8(), b.as_i32x8())) }
684}
685
686/// Compares packed 16-bit integers in `a` and `b` for greater-than.
687///
688/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi16)
689#[inline]
690#[target_feature(enable = "avx2")]
691#[cfg_attr(test, assert_instr(vpcmpgtw))]
692#[stable(feature = "simd_x86", since = "1.27.0")]
693pub fn _mm256_cmpgt_epi16(a: __m256i, b: __m256i) -> __m256i {
694    unsafe { transmute::<i16x16, _>(simd_gt(a.as_i16x16(), b.as_i16x16())) }
695}
696
697/// Compares packed 8-bit integers in `a` and `b` for greater-than.
698///
699/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cmpgt_epi8)
700#[inline]
701#[target_feature(enable = "avx2")]
702#[cfg_attr(test, assert_instr(vpcmpgtb))]
703#[stable(feature = "simd_x86", since = "1.27.0")]
704pub fn _mm256_cmpgt_epi8(a: __m256i, b: __m256i) -> __m256i {
705    unsafe { transmute::<i8x32, _>(simd_gt(a.as_i8x32(), b.as_i8x32())) }
706}
707
708/// Sign-extend 16-bit integers to 32-bit integers.
709///
710/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi32)
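///
/// # Examples
///
/// An illustrative sketch (not from the upstream docs): negative values stay
/// negative because the extension copies the sign bit:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let a = _mm_setr_epi16(-1, 2, -3, 4, -5, 6, -7, 8);
///             let r = _mm256_cvtepi16_epi32(a);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             assert_eq!(out, [-1, 2, -3, 4, -5, 6, -7, 8]);
///         }
///     }
/// }
/// ```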
711#[inline]
712#[target_feature(enable = "avx2")]
713#[cfg_attr(test, assert_instr(vpmovsxwd))]
714#[stable(feature = "simd_x86", since = "1.27.0")]
715pub fn _mm256_cvtepi16_epi32(a: __m128i) -> __m256i {
716    unsafe { transmute::<i32x8, _>(simd_cast(a.as_i16x8())) }
717}
718
719/// Sign-extend 16-bit integers to 64-bit integers.
720///
721/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi16_epi64)
722#[inline]
723#[target_feature(enable = "avx2")]
724#[cfg_attr(test, assert_instr(vpmovsxwq))]
725#[stable(feature = "simd_x86", since = "1.27.0")]
726pub fn _mm256_cvtepi16_epi64(a: __m128i) -> __m256i {
727    unsafe {
728        let a = a.as_i16x8();
729        let v64: i16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
730        transmute::<i64x4, _>(simd_cast(v64))
731    }
732}
733
734/// Sign-extend 32-bit integers to 64-bit integers.
735///
736/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi32_epi64)
737#[inline]
738#[target_feature(enable = "avx2")]
739#[cfg_attr(test, assert_instr(vpmovsxdq))]
740#[stable(feature = "simd_x86", since = "1.27.0")]
741pub fn _mm256_cvtepi32_epi64(a: __m128i) -> __m256i {
742    unsafe { transmute::<i64x4, _>(simd_cast(a.as_i32x4())) }
743}
744
745/// Sign-extend 8-bit integers to 16-bit integers.
746///
747/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi16)
748#[inline]
749#[target_feature(enable = "avx2")]
750#[cfg_attr(test, assert_instr(vpmovsxbw))]
751#[stable(feature = "simd_x86", since = "1.27.0")]
752pub fn _mm256_cvtepi8_epi16(a: __m128i) -> __m256i {
753    unsafe { transmute::<i16x16, _>(simd_cast(a.as_i8x16())) }
754}
755
756/// Sign-extend 8-bit integers to 32-bit integers.
757///
758/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi32)
759#[inline]
760#[target_feature(enable = "avx2")]
761#[cfg_attr(test, assert_instr(vpmovsxbd))]
762#[stable(feature = "simd_x86", since = "1.27.0")]
763pub fn _mm256_cvtepi8_epi32(a: __m128i) -> __m256i {
764    unsafe {
765        let a = a.as_i8x16();
766        let v64: i8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
767        transmute::<i32x8, _>(simd_cast(v64))
768    }
769}
770
771/// Sign-extend 8-bit integers to 64-bit integers.
772///
773/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepi8_epi64)
774#[inline]
775#[target_feature(enable = "avx2")]
776#[cfg_attr(test, assert_instr(vpmovsxbq))]
777#[stable(feature = "simd_x86", since = "1.27.0")]
778pub fn _mm256_cvtepi8_epi64(a: __m128i) -> __m256i {
779    unsafe {
780        let a = a.as_i8x16();
781        let v32: i8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
782        transmute::<i64x4, _>(simd_cast(v32))
783    }
784}
785
/// Zero-extend unsigned 16-bit integers in `a` to 32-bit integers.
788///
789/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi32)
790#[inline]
791#[target_feature(enable = "avx2")]
792#[cfg_attr(test, assert_instr(vpmovzxwd))]
793#[stable(feature = "simd_x86", since = "1.27.0")]
794pub fn _mm256_cvtepu16_epi32(a: __m128i) -> __m256i {
795    unsafe { transmute::<i32x8, _>(simd_cast(a.as_u16x8())) }
796}
797
798/// Zero-extend the lower four unsigned 16-bit integers in `a` to 64-bit
799/// integers. The upper four elements of `a` are unused.
800///
801/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu16_epi64)
802#[inline]
803#[target_feature(enable = "avx2")]
804#[cfg_attr(test, assert_instr(vpmovzxwq))]
805#[stable(feature = "simd_x86", since = "1.27.0")]
806pub fn _mm256_cvtepu16_epi64(a: __m128i) -> __m256i {
807    unsafe {
808        let a = a.as_u16x8();
809        let v64: u16x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
810        transmute::<i64x4, _>(simd_cast(v64))
811    }
812}
813
814/// Zero-extend unsigned 32-bit integers in `a` to 64-bit integers.
815///
816/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu32_epi64)
817#[inline]
818#[target_feature(enable = "avx2")]
819#[cfg_attr(test, assert_instr(vpmovzxdq))]
820#[stable(feature = "simd_x86", since = "1.27.0")]
821pub fn _mm256_cvtepu32_epi64(a: __m128i) -> __m256i {
822    unsafe { transmute::<i64x4, _>(simd_cast(a.as_u32x4())) }
823}
824
825/// Zero-extend unsigned 8-bit integers in `a` to 16-bit integers.
826///
827/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi16)
828#[inline]
829#[target_feature(enable = "avx2")]
830#[cfg_attr(test, assert_instr(vpmovzxbw))]
831#[stable(feature = "simd_x86", since = "1.27.0")]
832pub fn _mm256_cvtepu8_epi16(a: __m128i) -> __m256i {
833    unsafe { transmute::<i16x16, _>(simd_cast(a.as_u8x16())) }
834}
835
836/// Zero-extend the lower eight unsigned 8-bit integers in `a` to 32-bit
837/// integers. The upper eight elements of `a` are unused.
838///
839/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi32)
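///
/// # Examples
///
/// A hedged sketch (not from the upstream docs): because the extension is
/// unsigned, the byte `0xFF` becomes `255` rather than `-1`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let a = _mm_setr_epi8(-1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16);
///             // Only the low eight bytes participate.
///             let r = _mm256_cvtepu8_epi32(a);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             assert_eq!(out, [255, 2, 3, 4, 5, 6, 7, 8]);
///         }
///     }
/// }
/// ```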
840#[inline]
841#[target_feature(enable = "avx2")]
842#[cfg_attr(test, assert_instr(vpmovzxbd))]
843#[stable(feature = "simd_x86", since = "1.27.0")]
844pub fn _mm256_cvtepu8_epi32(a: __m128i) -> __m256i {
845    unsafe {
846        let a = a.as_u8x16();
847        let v64: u8x8 = simd_shuffle!(a, a, [0, 1, 2, 3, 4, 5, 6, 7]);
848        transmute::<i32x8, _>(simd_cast(v64))
849    }
850}
851
852/// Zero-extend the lower four unsigned 8-bit integers in `a` to 64-bit
853/// integers. The upper twelve elements of `a` are unused.
854///
855/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_cvtepu8_epi64)
856#[inline]
857#[target_feature(enable = "avx2")]
858#[cfg_attr(test, assert_instr(vpmovzxbq))]
859#[stable(feature = "simd_x86", since = "1.27.0")]
860pub fn _mm256_cvtepu8_epi64(a: __m128i) -> __m256i {
861    unsafe {
862        let a = a.as_u8x16();
863        let v32: u8x4 = simd_shuffle!(a, a, [0, 1, 2, 3]);
864        transmute::<i64x4, _>(simd_cast(v32))
865    }
866}
867
868/// Extracts 128 bits (of integer data) from `a` selected with `IMM1`.
869///
870/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extracti128_si256)
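///
/// # Examples
///
/// A minimal sketch (not from the upstream docs); `IMM1 = 1` selects the
/// upper 128-bit half:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let a = _mm256_setr_epi64x(1, 2, 3, 4);
///             let hi = _mm256_extracti128_si256::<1>(a);
///             let mut out = [0i64; 2];
///             _mm_storeu_si128(out.as_mut_ptr().cast(), hi);
///             assert_eq!(out, [3, 4]);
///         }
///     }
/// }
/// ```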
871#[inline]
872#[target_feature(enable = "avx2")]
873#[cfg_attr(test, assert_instr(vextractf128, IMM1 = 1))]
874#[rustc_legacy_const_generics(1)]
875#[stable(feature = "simd_x86", since = "1.27.0")]
876pub fn _mm256_extracti128_si256<const IMM1: i32>(a: __m256i) -> __m128i {
877    static_assert_uimm_bits!(IMM1, 1);
878    unsafe {
879        let a = a.as_i64x4();
880        let b = i64x4::ZERO;
881        let dst: i64x2 = simd_shuffle!(a, b, [[0, 1], [2, 3]][IMM1 as usize]);
882        transmute(dst)
883    }
884}
885
886/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`.
887///
888/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi16)
889#[inline]
890#[target_feature(enable = "avx2")]
891#[cfg_attr(test, assert_instr(vphaddw))]
892#[stable(feature = "simd_x86", since = "1.27.0")]
893pub fn _mm256_hadd_epi16(a: __m256i, b: __m256i) -> __m256i {
894    let a = a.as_i16x16();
895    let b = b.as_i16x16();
896    unsafe {
897        let even: i16x16 = simd_shuffle!(
898            a,
899            b,
900            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
901        );
902        let odd: i16x16 = simd_shuffle!(
903            a,
904            b,
905            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
906        );
907        simd_add(even, odd).as_m256i()
908    }
909}
910
911/// Horizontally adds adjacent pairs of 32-bit integers in `a` and `b`.
912///
913/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadd_epi32)
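///
/// # Examples
///
/// An illustrative sketch (not from the upstream docs). The pairwise sums are
/// grouped per 128-bit lane, with the sums from `a` preceding those from `b`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         unsafe {
///             let a = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
///             let b = _mm256_setr_epi32(10, 20, 30, 40, 50, 60, 70, 80);
///             let r = _mm256_hadd_epi32(a, b);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             assert_eq!(out, [3, 7, 30, 70, 11, 15, 110, 150]);
///         }
///     }
/// }
/// ```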
914#[inline]
915#[target_feature(enable = "avx2")]
916#[cfg_attr(test, assert_instr(vphaddd))]
917#[stable(feature = "simd_x86", since = "1.27.0")]
918pub fn _mm256_hadd_epi32(a: __m256i, b: __m256i) -> __m256i {
919    let a = a.as_i32x8();
920    let b = b.as_i32x8();
921    unsafe {
922        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
923        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
924        simd_add(even, odd).as_m256i()
925    }
926}
927
928/// Horizontally adds adjacent pairs of 16-bit integers in `a` and `b`
929/// using saturation.
930///
931/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hadds_epi16)
932#[inline]
933#[target_feature(enable = "avx2")]
934#[cfg_attr(test, assert_instr(vphaddsw))]
935#[stable(feature = "simd_x86", since = "1.27.0")]
936pub fn _mm256_hadds_epi16(a: __m256i, b: __m256i) -> __m256i {
937    unsafe { transmute(phaddsw(a.as_i16x16(), b.as_i16x16())) }
938}
939
/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`.
941///
942/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi16)
943#[inline]
944#[target_feature(enable = "avx2")]
945#[cfg_attr(test, assert_instr(vphsubw))]
946#[stable(feature = "simd_x86", since = "1.27.0")]
947pub fn _mm256_hsub_epi16(a: __m256i, b: __m256i) -> __m256i {
948    let a = a.as_i16x16();
949    let b = b.as_i16x16();
950    unsafe {
951        let even: i16x16 = simd_shuffle!(
952            a,
953            b,
954            [0, 2, 4, 6, 16, 18, 20, 22, 8, 10, 12, 14, 24, 26, 28, 30]
955        );
956        let odd: i16x16 = simd_shuffle!(
957            a,
958            b,
959            [1, 3, 5, 7, 17, 19, 21, 23, 9, 11, 13, 15, 25, 27, 29, 31]
960        );
961        simd_sub(even, odd).as_m256i()
962    }
963}
964
/// Horizontally subtracts adjacent pairs of 32-bit integers in `a` and `b`.
966///
967/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsub_epi32)
968#[inline]
969#[target_feature(enable = "avx2")]
970#[cfg_attr(test, assert_instr(vphsubd))]
971#[stable(feature = "simd_x86", since = "1.27.0")]
972pub fn _mm256_hsub_epi32(a: __m256i, b: __m256i) -> __m256i {
973    let a = a.as_i32x8();
974    let b = b.as_i32x8();
975    unsafe {
976        let even: i32x8 = simd_shuffle!(a, b, [0, 2, 8, 10, 4, 6, 12, 14]);
977        let odd: i32x8 = simd_shuffle!(a, b, [1, 3, 9, 11, 5, 7, 13, 15]);
978        simd_sub(even, odd).as_m256i()
979    }
980}
981
/// Horizontally subtracts adjacent pairs of 16-bit integers in `a` and `b`
983/// using saturation.
984///
985/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_hsubs_epi16)
986#[inline]
987#[target_feature(enable = "avx2")]
988#[cfg_attr(test, assert_instr(vphsubsw))]
989#[stable(feature = "simd_x86", since = "1.27.0")]
990pub fn _mm256_hsubs_epi16(a: __m256i, b: __m256i) -> __m256i {
991    unsafe { transmute(phsubsw(a.as_i16x16(), b.as_i16x16())) }
992}
993
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
997///
998/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi32)
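///
/// # Examples
///
/// A hedged sketch (not from the upstream docs). `SCALE` is 4 here because the
/// gathered elements are 4-byte `i32`s, so the offsets act as element indices;
/// the caller must keep every computed address inside the allocation:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         let data: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
///         unsafe {
///             let offsets = _mm_setr_epi32(0, 2, 4, 6);
///             let r = _mm_i32gather_epi32::<4>(data.as_ptr(), offsets);
///             let mut out = [0i32; 4];
///             _mm_storeu_si128(out.as_mut_ptr().cast(), r);
///             assert_eq!(out, [10, 12, 14, 16]);
///         }
///     }
/// }
/// ```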
999#[inline]
1000#[target_feature(enable = "avx2")]
1001#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1002#[rustc_legacy_const_generics(2)]
1003#[stable(feature = "simd_x86", since = "1.27.0")]
1004pub unsafe fn _mm_i32gather_epi32<const SCALE: i32>(
1005    slice: *const i32,
1006    offsets: __m128i,
1007) -> __m128i {
1008    static_assert_imm8_scale!(SCALE);
1009    let zero = i32x4::ZERO;
1010    let neg_one = _mm_set1_epi32(-1).as_i32x4();
1011    let offsets = offsets.as_i32x4();
1012    let slice = slice as *const i8;
1013    let r = pgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
1014    transmute(r)
1015}
1016
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used instead.
1021///
1022/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi32)
1023#[inline]
1024#[target_feature(enable = "avx2")]
1025#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1026#[rustc_legacy_const_generics(4)]
1027#[stable(feature = "simd_x86", since = "1.27.0")]
1028pub unsafe fn _mm_mask_i32gather_epi32<const SCALE: i32>(
1029    src: __m128i,
1030    slice: *const i32,
1031    offsets: __m128i,
1032    mask: __m128i,
1033) -> __m128i {
1034    static_assert_imm8_scale!(SCALE);
1035    let src = src.as_i32x4();
1036    let mask = mask.as_i32x4();
1037    let offsets = offsets.as_i32x4();
1038    let slice = slice as *const i8;
1039    let r = pgatherdd(src, slice, offsets, mask, SCALE as i8);
1040    transmute(r)
1041}
1042
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1046///
1047/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi32)
1048#[inline]
1049#[target_feature(enable = "avx2")]
1050#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1051#[rustc_legacy_const_generics(2)]
1052#[stable(feature = "simd_x86", since = "1.27.0")]
1053pub unsafe fn _mm256_i32gather_epi32<const SCALE: i32>(
1054    slice: *const i32,
1055    offsets: __m256i,
1056) -> __m256i {
1057    static_assert_imm8_scale!(SCALE);
1058    let zero = i32x8::ZERO;
1059    let neg_one = _mm256_set1_epi32(-1).as_i32x8();
1060    let offsets = offsets.as_i32x8();
1061    let slice = slice as *const i8;
1062    let r = vpgatherdd(zero, slice, offsets, neg_one, SCALE as i8);
1063    transmute(r)
1064}
1065
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used instead.
1070///
1071/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi32)
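///
/// # Examples
///
/// An illustrative sketch (not from the upstream docs): elements whose `mask`
/// element has its highest bit set are gathered from memory, while the rest
/// are copied from `src`:
///
/// ```
/// #[cfg(target_arch = "x86_64")]
/// {
///     use std::arch::x86_64::*;
///     if is_x86_feature_detected!("avx2") {
///         let data: [i32; 8] = [10, 11, 12, 13, 14, 15, 16, 17];
///         unsafe {
///             let src = _mm256_set1_epi32(-1);
///             let offsets = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
///             let mask = _mm256_setr_epi32(-1, 0, -1, 0, -1, 0, -1, 0);
///             let r = _mm256_mask_i32gather_epi32::<4>(src, data.as_ptr(), offsets, mask);
///             let mut out = [0i32; 8];
///             _mm256_storeu_si256(out.as_mut_ptr().cast(), r);
///             assert_eq!(out, [10, -1, 12, -1, 14, -1, 16, -1]);
///         }
///     }
/// }
/// ```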
1072#[inline]
1073#[target_feature(enable = "avx2")]
1074#[cfg_attr(test, assert_instr(vpgatherdd, SCALE = 1))]
1075#[rustc_legacy_const_generics(4)]
1076#[stable(feature = "simd_x86", since = "1.27.0")]
1077pub unsafe fn _mm256_mask_i32gather_epi32<const SCALE: i32>(
1078    src: __m256i,
1079    slice: *const i32,
1080    offsets: __m256i,
1081    mask: __m256i,
1082) -> __m256i {
1083    static_assert_imm8_scale!(SCALE);
1084    let src = src.as_i32x8();
1085    let mask = mask.as_i32x8();
1086    let offsets = offsets.as_i32x8();
1087    let slice = slice as *const i8;
1088    let r = vpgatherdd(src, slice, offsets, mask, SCALE as i8);
1089    transmute(r)
1090}
1091
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1095///
1096/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_ps)
1097#[inline]
1098#[target_feature(enable = "avx2")]
1099#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1100#[rustc_legacy_const_generics(2)]
1101#[stable(feature = "simd_x86", since = "1.27.0")]
1102pub unsafe fn _mm_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1103    static_assert_imm8_scale!(SCALE);
1104    let zero = _mm_setzero_ps();
1105    let neg_one = _mm_set1_ps(-1.0);
1106    let offsets = offsets.as_i32x4();
1107    let slice = slice as *const i8;
1108    pgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1109}
1110
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used instead.
1115///
1116/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_ps)
1117#[inline]
1118#[target_feature(enable = "avx2")]
1119#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1120#[rustc_legacy_const_generics(4)]
1121#[stable(feature = "simd_x86", since = "1.27.0")]
1122pub unsafe fn _mm_mask_i32gather_ps<const SCALE: i32>(
1123    src: __m128,
1124    slice: *const f32,
1125    offsets: __m128i,
1126    mask: __m128,
1127) -> __m128 {
1128    static_assert_imm8_scale!(SCALE);
1129    let offsets = offsets.as_i32x4();
1130    let slice = slice as *const i8;
1131    pgatherdps(src, slice, offsets, mask, SCALE as i8)
1132}
1133
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1137///
1138/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_ps)
1139#[inline]
1140#[target_feature(enable = "avx2")]
1141#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1142#[rustc_legacy_const_generics(2)]
1143#[stable(feature = "simd_x86", since = "1.27.0")]
1144pub unsafe fn _mm256_i32gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m256 {
1145    static_assert_imm8_scale!(SCALE);
1146    let zero = _mm256_setzero_ps();
1147    let neg_one = _mm256_set1_ps(-1.0);
1148    let offsets = offsets.as_i32x8();
1149    let slice = slice as *const i8;
1150    vpgatherdps(zero, slice, offsets, neg_one, SCALE as i8)
1151}
1152
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used instead.
1157///
1158/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_ps)
1159#[inline]
1160#[target_feature(enable = "avx2")]
1161#[cfg_attr(test, assert_instr(vgatherdps, SCALE = 1))]
1162#[rustc_legacy_const_generics(4)]
1163#[stable(feature = "simd_x86", since = "1.27.0")]
1164pub unsafe fn _mm256_mask_i32gather_ps<const SCALE: i32>(
1165    src: __m256,
1166    slice: *const f32,
1167    offsets: __m256i,
1168    mask: __m256,
1169) -> __m256 {
1170    static_assert_imm8_scale!(SCALE);
1171    let offsets = offsets.as_i32x8();
1172    let slice = slice as *const i8;
1173    vpgatherdps(src, slice, offsets, mask, SCALE as i8)
1174}
1175
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1179///
1180/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_epi64)
1181#[inline]
1182#[target_feature(enable = "avx2")]
1183#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1184#[rustc_legacy_const_generics(2)]
1185#[stable(feature = "simd_x86", since = "1.27.0")]
1186pub unsafe fn _mm_i32gather_epi64<const SCALE: i32>(
1187    slice: *const i64,
1188    offsets: __m128i,
1189) -> __m128i {
1190    static_assert_imm8_scale!(SCALE);
1191    let zero = i64x2::ZERO;
1192    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1193    let offsets = offsets.as_i32x4();
1194    let slice = slice as *const i8;
1195    let r = pgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1196    transmute(r)
1197}
1198
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used instead.
1203///
1204/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_epi64)
1205#[inline]
1206#[target_feature(enable = "avx2")]
1207#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1208#[rustc_legacy_const_generics(4)]
1209#[stable(feature = "simd_x86", since = "1.27.0")]
1210pub unsafe fn _mm_mask_i32gather_epi64<const SCALE: i32>(
1211    src: __m128i,
1212    slice: *const i64,
1213    offsets: __m128i,
1214    mask: __m128i,
1215) -> __m128i {
1216    static_assert_imm8_scale!(SCALE);
1217    let src = src.as_i64x2();
1218    let mask = mask.as_i64x2();
1219    let offsets = offsets.as_i32x4();
1220    let slice = slice as *const i8;
1221    let r = pgatherdq(src, slice, offsets, mask, SCALE as i8);
1222    transmute(r)
1223}
1224
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1228///
1229/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_epi64)
1230#[inline]
1231#[target_feature(enable = "avx2")]
1232#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1233#[rustc_legacy_const_generics(2)]
1234#[stable(feature = "simd_x86", since = "1.27.0")]
1235pub unsafe fn _mm256_i32gather_epi64<const SCALE: i32>(
1236    slice: *const i64,
1237    offsets: __m128i,
1238) -> __m256i {
1239    static_assert_imm8_scale!(SCALE);
1240    let zero = i64x4::ZERO;
1241    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1242    let offsets = offsets.as_i32x4();
1243    let slice = slice as *const i8;
1244    let r = vpgatherdq(zero, slice, offsets, neg_one, SCALE as i8);
1245    transmute(r)
1246}
1247
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used instead.
1252///
1253/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_epi64)
1254#[inline]
1255#[target_feature(enable = "avx2")]
1256#[cfg_attr(test, assert_instr(vpgatherdq, SCALE = 1))]
1257#[rustc_legacy_const_generics(4)]
1258#[stable(feature = "simd_x86", since = "1.27.0")]
1259pub unsafe fn _mm256_mask_i32gather_epi64<const SCALE: i32>(
1260    src: __m256i,
1261    slice: *const i64,
1262    offsets: __m128i,
1263    mask: __m256i,
1264) -> __m256i {
1265    static_assert_imm8_scale!(SCALE);
1266    let src = src.as_i64x4();
1267    let mask = mask.as_i64x4();
1268    let offsets = offsets.as_i32x4();
1269    let slice = slice as *const i8;
1270    let r = vpgatherdq(src, slice, offsets, mask, SCALE as i8);
1271    transmute(r)
1272}
1273
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1277///
1278/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i32gather_pd)
1279#[inline]
1280#[target_feature(enable = "avx2")]
1281#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1282#[rustc_legacy_const_generics(2)]
1283#[stable(feature = "simd_x86", since = "1.27.0")]
1284pub unsafe fn _mm_i32gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1285    static_assert_imm8_scale!(SCALE);
1286    let zero = _mm_setzero_pd();
1287    let neg_one = _mm_set1_pd(-1.0);
1288    let offsets = offsets.as_i32x4();
1289    let slice = slice as *const i8;
1290    pgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1291}
1292
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used instead.
1297///
1298/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i32gather_pd)
1299#[inline]
1300#[target_feature(enable = "avx2")]
1301#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1302#[rustc_legacy_const_generics(4)]
1303#[stable(feature = "simd_x86", since = "1.27.0")]
1304pub unsafe fn _mm_mask_i32gather_pd<const SCALE: i32>(
1305    src: __m128d,
1306    slice: *const f64,
1307    offsets: __m128i,
1308    mask: __m128d,
1309) -> __m128d {
1310    static_assert_imm8_scale!(SCALE);
1311    let offsets = offsets.as_i32x4();
1312    let slice = slice as *const i8;
1313    pgatherdpd(src, slice, offsets, mask, SCALE as i8)
1314}
1315
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1319///
1320/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i32gather_pd)
1321#[inline]
1322#[target_feature(enable = "avx2")]
1323#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1324#[rustc_legacy_const_generics(2)]
1325#[stable(feature = "simd_x86", since = "1.27.0")]
1326pub unsafe fn _mm256_i32gather_pd<const SCALE: i32>(
1327    slice: *const f64,
1328    offsets: __m128i,
1329) -> __m256d {
1330    static_assert_imm8_scale!(SCALE);
1331    let zero = _mm256_setzero_pd();
1332    let neg_one = _mm256_set1_pd(-1.0);
1333    let offsets = offsets.as_i32x4();
1334    let slice = slice as *const i8;
1335    vpgatherdpd(zero, slice, offsets, neg_one, SCALE as i8)
1336}
1337
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. If the highest bit of the corresponding
/// element of `mask` is not set, the value from `src` is used instead.
1342///
1343/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i32gather_pd)
1344#[inline]
1345#[target_feature(enable = "avx2")]
1346#[cfg_attr(test, assert_instr(vgatherdpd, SCALE = 1))]
1347#[rustc_legacy_const_generics(4)]
1348#[stable(feature = "simd_x86", since = "1.27.0")]
1349pub unsafe fn _mm256_mask_i32gather_pd<const SCALE: i32>(
1350    src: __m256d,
1351    slice: *const f64,
1352    offsets: __m128i,
1353    mask: __m256d,
1354) -> __m256d {
1355    static_assert_imm8_scale!(SCALE);
1356    let offsets = offsets.as_i32x4();
1357    let slice = slice as *const i8;
1358    vpgatherdpd(src, slice, offsets, mask, SCALE as i8)
1359}
1360
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1364///
1365/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi32)
1366#[inline]
1367#[target_feature(enable = "avx2")]
1368#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1369#[rustc_legacy_const_generics(2)]
1370#[stable(feature = "simd_x86", since = "1.27.0")]
1371pub unsafe fn _mm_i64gather_epi32<const SCALE: i32>(
1372    slice: *const i32,
1373    offsets: __m128i,
1374) -> __m128i {
1375    static_assert_imm8_scale!(SCALE);
1376    let zero = i32x4::ZERO;
1377    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1378    let offsets = offsets.as_i64x2();
1379    let slice = slice as *const i8;
1380    let r = pgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1381    transmute(r)
1382}
1383
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value from `src` is
/// used instead of loading from memory.
1388///
1389/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi32)
1390#[inline]
1391#[target_feature(enable = "avx2")]
1392#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1393#[rustc_legacy_const_generics(4)]
1394#[stable(feature = "simd_x86", since = "1.27.0")]
1395pub unsafe fn _mm_mask_i64gather_epi32<const SCALE: i32>(
1396    src: __m128i,
1397    slice: *const i32,
1398    offsets: __m128i,
1399    mask: __m128i,
1400) -> __m128i {
1401    static_assert_imm8_scale!(SCALE);
1402    let src = src.as_i32x4();
1403    let mask = mask.as_i32x4();
1404    let offsets = offsets.as_i64x2();
1405    let slice = slice as *const i8;
1406    let r = pgatherqd(src, slice, offsets, mask, SCALE as i8);
1407    transmute(r)
1408}
1409
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1413///
1414/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi32)
1415#[inline]
1416#[target_feature(enable = "avx2")]
1417#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1418#[rustc_legacy_const_generics(2)]
1419#[stable(feature = "simd_x86", since = "1.27.0")]
1420pub unsafe fn _mm256_i64gather_epi32<const SCALE: i32>(
1421    slice: *const i32,
1422    offsets: __m256i,
1423) -> __m128i {
1424    static_assert_imm8_scale!(SCALE);
1425    let zero = i32x4::ZERO;
1426    let neg_one = _mm_set1_epi64x(-1).as_i32x4();
1427    let offsets = offsets.as_i64x4();
1428    let slice = slice as *const i8;
1429    let r = vpgatherqd(zero, slice, offsets, neg_one, SCALE as i8);
1430    transmute(r)
1431}
1432
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value from `src` is
/// used instead of loading from memory.
1437///
1438/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi32)
1439#[inline]
1440#[target_feature(enable = "avx2")]
1441#[cfg_attr(test, assert_instr(vpgatherqd, SCALE = 1))]
1442#[rustc_legacy_const_generics(4)]
1443#[stable(feature = "simd_x86", since = "1.27.0")]
1444pub unsafe fn _mm256_mask_i64gather_epi32<const SCALE: i32>(
1445    src: __m128i,
1446    slice: *const i32,
1447    offsets: __m256i,
1448    mask: __m128i,
1449) -> __m128i {
1450    static_assert_imm8_scale!(SCALE);
1451    let src = src.as_i32x4();
1452    let mask = mask.as_i32x4();
1453    let offsets = offsets.as_i64x4();
1454    let slice = slice as *const i8;
1455    let r = vpgatherqd(src, slice, offsets, mask, SCALE as i8);
1456    transmute(r)
1457}
1458
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1462///
1463/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_ps)
1464#[inline]
1465#[target_feature(enable = "avx2")]
1466#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1467#[rustc_legacy_const_generics(2)]
1468#[stable(feature = "simd_x86", since = "1.27.0")]
1469pub unsafe fn _mm_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m128i) -> __m128 {
1470    static_assert_imm8_scale!(SCALE);
1471    let zero = _mm_setzero_ps();
1472    let neg_one = _mm_set1_ps(-1.0);
1473    let offsets = offsets.as_i64x2();
1474    let slice = slice as *const i8;
1475    pgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1476}
1477
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value from `src` is
/// used instead of loading from memory.
1482///
1483/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_ps)
1484#[inline]
1485#[target_feature(enable = "avx2")]
1486#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1487#[rustc_legacy_const_generics(4)]
1488#[stable(feature = "simd_x86", since = "1.27.0")]
1489pub unsafe fn _mm_mask_i64gather_ps<const SCALE: i32>(
1490    src: __m128,
1491    slice: *const f32,
1492    offsets: __m128i,
1493    mask: __m128,
1494) -> __m128 {
1495    static_assert_imm8_scale!(SCALE);
1496    let offsets = offsets.as_i64x2();
1497    let slice = slice as *const i8;
1498    pgatherqps(src, slice, offsets, mask, SCALE as i8)
1499}
1500
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1504///
1505/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_ps)
1506#[inline]
1507#[target_feature(enable = "avx2")]
1508#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1509#[rustc_legacy_const_generics(2)]
1510#[stable(feature = "simd_x86", since = "1.27.0")]
1511pub unsafe fn _mm256_i64gather_ps<const SCALE: i32>(slice: *const f32, offsets: __m256i) -> __m128 {
1512    static_assert_imm8_scale!(SCALE);
1513    let zero = _mm_setzero_ps();
1514    let neg_one = _mm_set1_ps(-1.0);
1515    let offsets = offsets.as_i64x4();
1516    let slice = slice as *const i8;
1517    vpgatherqps(zero, slice, offsets, neg_one, SCALE as i8)
1518}
1519
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value from `src` is
/// used instead of loading from memory.
1524///
1525/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_ps)
1526#[inline]
1527#[target_feature(enable = "avx2")]
1528#[cfg_attr(test, assert_instr(vgatherqps, SCALE = 1))]
1529#[rustc_legacy_const_generics(4)]
1530#[stable(feature = "simd_x86", since = "1.27.0")]
1531pub unsafe fn _mm256_mask_i64gather_ps<const SCALE: i32>(
1532    src: __m128,
1533    slice: *const f32,
1534    offsets: __m256i,
1535    mask: __m128,
1536) -> __m128 {
1537    static_assert_imm8_scale!(SCALE);
1538    let offsets = offsets.as_i64x4();
1539    let slice = slice as *const i8;
1540    vpgatherqps(src, slice, offsets, mask, SCALE as i8)
1541}
1542
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1546///
1547/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_epi64)
1548#[inline]
1549#[target_feature(enable = "avx2")]
1550#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1551#[rustc_legacy_const_generics(2)]
1552#[stable(feature = "simd_x86", since = "1.27.0")]
1553pub unsafe fn _mm_i64gather_epi64<const SCALE: i32>(
1554    slice: *const i64,
1555    offsets: __m128i,
1556) -> __m128i {
1557    static_assert_imm8_scale!(SCALE);
1558    let zero = i64x2::ZERO;
1559    let neg_one = _mm_set1_epi64x(-1).as_i64x2();
1560    let slice = slice as *const i8;
1561    let offsets = offsets.as_i64x2();
1562    let r = pgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1563    transmute(r)
1564}
1565
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value from `src` is
/// used instead of loading from memory.
1570///
1571/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_epi64)
1572#[inline]
1573#[target_feature(enable = "avx2")]
1574#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1575#[rustc_legacy_const_generics(4)]
1576#[stable(feature = "simd_x86", since = "1.27.0")]
1577pub unsafe fn _mm_mask_i64gather_epi64<const SCALE: i32>(
1578    src: __m128i,
1579    slice: *const i64,
1580    offsets: __m128i,
1581    mask: __m128i,
1582) -> __m128i {
1583    static_assert_imm8_scale!(SCALE);
1584    let src = src.as_i64x2();
1585    let mask = mask.as_i64x2();
1586    let offsets = offsets.as_i64x2();
1587    let slice = slice as *const i8;
1588    let r = pgatherqq(src, slice, offsets, mask, SCALE as i8);
1589    transmute(r)
1590}
1591
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1595///
1596/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_epi64)
1597#[inline]
1598#[target_feature(enable = "avx2")]
1599#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1600#[rustc_legacy_const_generics(2)]
1601#[stable(feature = "simd_x86", since = "1.27.0")]
1602pub unsafe fn _mm256_i64gather_epi64<const SCALE: i32>(
1603    slice: *const i64,
1604    offsets: __m256i,
1605) -> __m256i {
1606    static_assert_imm8_scale!(SCALE);
1607    let zero = i64x4::ZERO;
1608    let neg_one = _mm256_set1_epi64x(-1).as_i64x4();
1609    let slice = slice as *const i8;
1610    let offsets = offsets.as_i64x4();
1611    let r = vpgatherqq(zero, slice, offsets, neg_one, SCALE as i8);
1612    transmute(r)
1613}
1614
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value from `src` is
/// used instead of loading from memory.
1619///
1620/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_epi64)
1621#[inline]
1622#[target_feature(enable = "avx2")]
1623#[cfg_attr(test, assert_instr(vpgatherqq, SCALE = 1))]
1624#[rustc_legacy_const_generics(4)]
1625#[stable(feature = "simd_x86", since = "1.27.0")]
1626pub unsafe fn _mm256_mask_i64gather_epi64<const SCALE: i32>(
1627    src: __m256i,
1628    slice: *const i64,
1629    offsets: __m256i,
1630    mask: __m256i,
1631) -> __m256i {
1632    static_assert_imm8_scale!(SCALE);
1633    let src = src.as_i64x4();
1634    let mask = mask.as_i64x4();
1635    let offsets = offsets.as_i64x4();
1636    let slice = slice as *const i8;
1637    let r = vpgatherqq(src, slice, offsets, mask, SCALE as i8);
1638    transmute(r)
1639}
1640
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1644///
1645/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_i64gather_pd)
1646#[inline]
1647#[target_feature(enable = "avx2")]
1648#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1649#[rustc_legacy_const_generics(2)]
1650#[stable(feature = "simd_x86", since = "1.27.0")]
1651pub unsafe fn _mm_i64gather_pd<const SCALE: i32>(slice: *const f64, offsets: __m128i) -> __m128d {
1652    static_assert_imm8_scale!(SCALE);
1653    let zero = _mm_setzero_pd();
1654    let neg_one = _mm_set1_pd(-1.0);
1655    let slice = slice as *const i8;
1656    let offsets = offsets.as_i64x2();
1657    pgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1658}
1659
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value from `src` is
/// used instead of loading from memory.
1664///
1665/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mask_i64gather_pd)
1666#[inline]
1667#[target_feature(enable = "avx2")]
1668#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1669#[rustc_legacy_const_generics(4)]
1670#[stable(feature = "simd_x86", since = "1.27.0")]
1671pub unsafe fn _mm_mask_i64gather_pd<const SCALE: i32>(
1672    src: __m128d,
1673    slice: *const f64,
1674    offsets: __m128i,
1675    mask: __m128d,
1676) -> __m128d {
1677    static_assert_imm8_scale!(SCALE);
1678    let slice = slice as *const i8;
1679    let offsets = offsets.as_i64x2();
1680    pgatherqpd(src, slice, offsets, mask, SCALE as i8)
1681}
1682
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8.
1686///
1687/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_i64gather_pd)
1688#[inline]
1689#[target_feature(enable = "avx2")]
1690#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1691#[rustc_legacy_const_generics(2)]
1692#[stable(feature = "simd_x86", since = "1.27.0")]
1693pub unsafe fn _mm256_i64gather_pd<const SCALE: i32>(
1694    slice: *const f64,
1695    offsets: __m256i,
1696) -> __m256d {
1697    static_assert_imm8_scale!(SCALE);
1698    let zero = _mm256_setzero_pd();
1699    let neg_one = _mm256_set1_pd(-1.0);
1700    let slice = slice as *const i8;
1701    let offsets = offsets.as_i64x4();
1702    vpgatherqpd(zero, slice, offsets, neg_one, SCALE as i8)
1703}
1704
/// Returns values from `slice` at offsets determined by `offsets * scale`,
/// where `scale` should be 1, 2, 4 or 8. For positions where the highest bit
/// of the corresponding `mask` element is not set, the value from `src` is
/// used instead of loading from memory.
1709///
1710/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mask_i64gather_pd)
1711#[inline]
1712#[target_feature(enable = "avx2")]
1713#[cfg_attr(test, assert_instr(vgatherqpd, SCALE = 1))]
1714#[rustc_legacy_const_generics(4)]
1715#[stable(feature = "simd_x86", since = "1.27.0")]
1716pub unsafe fn _mm256_mask_i64gather_pd<const SCALE: i32>(
1717    src: __m256d,
1718    slice: *const f64,
1719    offsets: __m256i,
1720    mask: __m256d,
1721) -> __m256d {
1722    static_assert_imm8_scale!(SCALE);
1723    let slice = slice as *const i8;
1724    let offsets = offsets.as_i64x4();
1725    vpgatherqpd(src, slice, offsets, mask, SCALE as i8)
1726}
1727
/// Copies `a` to `dst`, then inserts 128 bits (of integer data) from `b` at the
1729/// location specified by `IMM1`.
1730///
1731/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_inserti128_si256)
1732#[inline]
1733#[target_feature(enable = "avx2")]
1734#[cfg_attr(test, assert_instr(vinsertf128, IMM1 = 1))]
1735#[rustc_legacy_const_generics(2)]
1736#[stable(feature = "simd_x86", since = "1.27.0")]
1737pub fn _mm256_inserti128_si256<const IMM1: i32>(a: __m256i, b: __m128i) -> __m256i {
1738    static_assert_uimm_bits!(IMM1, 1);
1739    unsafe {
1740        let a = a.as_i64x4();
1741        let b = _mm256_castsi128_si256(b).as_i64x4();
1742        let dst: i64x4 = simd_shuffle!(a, b, [[4, 5, 2, 3], [0, 1, 4, 5]][IMM1 as usize]);
1743        transmute(dst)
1744    }
1745}
1746
1747/// Multiplies packed signed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Horizontally adds adjacent pairs
/// of intermediate 32-bit integers.
1750///
1751/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_madd_epi16)
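///
/// A scalar sketch of the semantics (illustrative only, not the actual
/// implementation; the helper name is hypothetical):
///
/// ```
/// fn madd_epi16(a: [i16; 16], b: [i16; 16]) -> [i32; 8] {
///     let mut r = [0i32; 8];
///     for i in 0..8 {
///         // Widen to 32 bits, multiply, then add the adjacent pair. The only
///         // case that wraps is `(-32768 * -32768) * 2`, which the hardware
///         // also wraps.
///         r[i] = (a[2 * i] as i32 * b[2 * i] as i32)
///             .wrapping_add(a[2 * i + 1] as i32 * b[2 * i + 1] as i32);
///     }
///     r
/// }
/// ```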
1752#[inline]
1753#[target_feature(enable = "avx2")]
1754#[cfg_attr(test, assert_instr(vpmaddwd))]
1755#[stable(feature = "simd_x86", since = "1.27.0")]
1756pub fn _mm256_madd_epi16(a: __m256i, b: __m256i) -> __m256i {
    // The pattern below is a trick used in the Adler-32 algorithm to perform a widening addition:
1758    //
1759    // ```rust
1760    // #[target_feature(enable = "avx2")]
1761    // unsafe fn widening_add(mad: __m256i) -> __m256i {
1762    //     _mm256_madd_epi16(mad, _mm256_set1_epi16(1))
1763    // }
1764    // ```
1765    //
1766    // If we implement this using generic vector intrinsics, the optimizer
1767    // will eliminate this pattern, and `vpmaddwd` will no longer be emitted.
1768    // For this reason, we use x86 intrinsics.
1769    unsafe { transmute(pmaddwd(a.as_i16x16(), b.as_i16x16())) }
1770}
1771
1772/// Vertically multiplies each unsigned 8-bit integer from `a` with the
1773/// corresponding signed 8-bit integer from `b`, producing intermediate
/// signed 16-bit integers. Horizontally adds adjacent pairs of intermediate
/// signed 16-bit integers using signed saturation.
1776///
1777/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maddubs_epi16)
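///
/// A scalar sketch of the semantics (illustrative only, not the actual
/// implementation; the helper name is hypothetical):
///
/// ```
/// fn maddubs_epi16(a: [u8; 32], b: [i8; 32]) -> [i16; 16] {
///     let mut r = [0i16; 16];
///     for i in 0..16 {
///         // Each u8 * i8 product fits in an i16; the pairwise sum saturates.
///         let lo = a[2 * i] as i16 * b[2 * i] as i16;
///         let hi = a[2 * i + 1] as i16 * b[2 * i + 1] as i16;
///         r[i] = lo.saturating_add(hi);
///     }
///     r
/// }
/// ```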
1778#[inline]
1779#[target_feature(enable = "avx2")]
1780#[cfg_attr(test, assert_instr(vpmaddubsw))]
1781#[stable(feature = "simd_x86", since = "1.27.0")]
1782pub fn _mm256_maddubs_epi16(a: __m256i, b: __m256i) -> __m256i {
1783    unsafe { transmute(pmaddubsw(a.as_u8x32(), b.as_i8x32())) }
1784}
1785
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1787/// (elements are zeroed out when the highest bit is not set in the
1788/// corresponding element).
1789///
1790/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi32)
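///
/// A scalar sketch of the masked-load semantics (illustrative only; the
/// helper name is hypothetical, and the other `maskload` intrinsics differ
/// only in element type and count):
///
/// ```
/// fn maskload_epi32(mem: &[i32; 4], mask: [i32; 4]) -> [i32; 4] {
///     let mut r = [0i32; 4];
///     for i in 0..4 {
///         // Load only where the highest (sign) bit of the mask element is set.
///         if mask[i] < 0 {
///             r[i] = mem[i];
///         }
///     }
///     r
/// }
/// ```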
1791#[inline]
1792#[target_feature(enable = "avx2")]
1793#[cfg_attr(test, assert_instr(vpmaskmovd))]
1794#[stable(feature = "simd_x86", since = "1.27.0")]
1795pub unsafe fn _mm_maskload_epi32(mem_addr: *const i32, mask: __m128i) -> __m128i {
1796    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
1797    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x4::ZERO).as_m128i()
1798}
1799
/// Loads packed 32-bit integers from memory pointed to by `mem_addr` using `mask`
1801/// (elements are zeroed out when the highest bit is not set in the
1802/// corresponding element).
1803///
1804/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi32)
1805#[inline]
1806#[target_feature(enable = "avx2")]
1807#[cfg_attr(test, assert_instr(vpmaskmovd))]
1808#[stable(feature = "simd_x86", since = "1.27.0")]
1809pub unsafe fn _mm256_maskload_epi32(mem_addr: *const i32, mask: __m256i) -> __m256i {
1810    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
1811    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i32x8::ZERO).as_m256i()
1812}
1813
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1815/// (elements are zeroed out when the highest bit is not set in the
1816/// corresponding element).
1817///
1818/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskload_epi64)
1819#[inline]
1820#[target_feature(enable = "avx2")]
1821#[cfg_attr(test, assert_instr(vpmaskmovq))]
1822#[stable(feature = "simd_x86", since = "1.27.0")]
1823pub unsafe fn _mm_maskload_epi64(mem_addr: *const i64, mask: __m128i) -> __m128i {
1824    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
1825    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x2::ZERO).as_m128i()
1826}
1827
/// Loads packed 64-bit integers from memory pointed to by `mem_addr` using `mask`
1829/// (elements are zeroed out when the highest bit is not set in the
1830/// corresponding element).
1831///
1832/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskload_epi64)
1833#[inline]
1834#[target_feature(enable = "avx2")]
1835#[cfg_attr(test, assert_instr(vpmaskmovq))]
1836#[stable(feature = "simd_x86", since = "1.27.0")]
1837pub unsafe fn _mm256_maskload_epi64(mem_addr: *const i64, mask: __m256i) -> __m256i {
1838    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
1839    simd_masked_load!(SimdAlign::Unaligned, mask, mem_addr, i64x4::ZERO).as_m256i()
1840}
1841
/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1843/// using `mask` (elements are not stored when the highest bit is not set
1844/// in the corresponding element).
1845///
1846/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi32)
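///
/// A scalar sketch of the masked-store semantics (illustrative only; the
/// helper name is hypothetical, and the other `maskstore` intrinsics differ
/// only in element type and count):
///
/// ```
/// fn maskstore_epi32(mem: &mut [i32; 4], mask: [i32; 4], a: [i32; 4]) {
///     for i in 0..4 {
///         // Store only where the highest (sign) bit of the mask element is
///         // set; other memory locations are left untouched.
///         if mask[i] < 0 {
///             mem[i] = a[i];
///         }
///     }
/// }
/// ```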
1847#[inline]
1848#[target_feature(enable = "avx2")]
1849#[cfg_attr(test, assert_instr(vpmaskmovd))]
1850#[stable(feature = "simd_x86", since = "1.27.0")]
1851pub unsafe fn _mm_maskstore_epi32(mem_addr: *mut i32, mask: __m128i, a: __m128i) {
1852    let mask = simd_shr(mask.as_i32x4(), i32x4::splat(31));
1853    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x4())
1854}
1855
/// Stores packed 32-bit integers from `a` into memory pointed to by `mem_addr`
1857/// using `mask` (elements are not stored when the highest bit is not set
1858/// in the corresponding element).
1859///
1860/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi32)
1861#[inline]
1862#[target_feature(enable = "avx2")]
1863#[cfg_attr(test, assert_instr(vpmaskmovd))]
1864#[stable(feature = "simd_x86", since = "1.27.0")]
1865pub unsafe fn _mm256_maskstore_epi32(mem_addr: *mut i32, mask: __m256i, a: __m256i) {
1866    let mask = simd_shr(mask.as_i32x8(), i32x8::splat(31));
1867    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i32x8())
1868}
1869
/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1871/// using `mask` (elements are not stored when the highest bit is not set
1872/// in the corresponding element).
1873///
1874/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskstore_epi64)
1875#[inline]
1876#[target_feature(enable = "avx2")]
1877#[cfg_attr(test, assert_instr(vpmaskmovq))]
1878#[stable(feature = "simd_x86", since = "1.27.0")]
1879pub unsafe fn _mm_maskstore_epi64(mem_addr: *mut i64, mask: __m128i, a: __m128i) {
1880    let mask = simd_shr(mask.as_i64x2(), i64x2::splat(63));
1881    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x2())
1882}
1883
/// Stores packed 64-bit integers from `a` into memory pointed to by `mem_addr`
1885/// using `mask` (elements are not stored when the highest bit is not set
1886/// in the corresponding element).
1887///
1888/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_maskstore_epi64)
1889#[inline]
1890#[target_feature(enable = "avx2")]
1891#[cfg_attr(test, assert_instr(vpmaskmovq))]
1892#[stable(feature = "simd_x86", since = "1.27.0")]
1893pub unsafe fn _mm256_maskstore_epi64(mem_addr: *mut i64, mask: __m256i, a: __m256i) {
1894    let mask = simd_shr(mask.as_i64x4(), i64x4::splat(63));
1895    simd_masked_store!(SimdAlign::Unaligned, mask, mem_addr, a.as_i64x4())
1896}
1897
1898/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1899/// maximum values.
1900///
1901/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi16)
1902#[inline]
1903#[target_feature(enable = "avx2")]
1904#[cfg_attr(test, assert_instr(vpmaxsw))]
1905#[stable(feature = "simd_x86", since = "1.27.0")]
1906pub fn _mm256_max_epi16(a: __m256i, b: __m256i) -> __m256i {
1907    unsafe {
1908        let a = a.as_i16x16();
1909        let b = b.as_i16x16();
1910        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1911    }
1912}
1913
1914/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
1915/// maximum values.
1916///
1917/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi32)
1918#[inline]
1919#[target_feature(enable = "avx2")]
1920#[cfg_attr(test, assert_instr(vpmaxsd))]
1921#[stable(feature = "simd_x86", since = "1.27.0")]
1922pub fn _mm256_max_epi32(a: __m256i, b: __m256i) -> __m256i {
1923    unsafe {
1924        let a = a.as_i32x8();
1925        let b = b.as_i32x8();
1926        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
1927    }
1928}
1929
1930/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
1931/// maximum values.
1932///
1933/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epi8)
1934#[inline]
1935#[target_feature(enable = "avx2")]
1936#[cfg_attr(test, assert_instr(vpmaxsb))]
1937#[stable(feature = "simd_x86", since = "1.27.0")]
1938pub fn _mm256_max_epi8(a: __m256i, b: __m256i) -> __m256i {
1939    unsafe {
1940        let a = a.as_i8x32();
1941        let b = b.as_i8x32();
1942        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
1943    }
1944}
1945
1946/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
1947/// the packed maximum values.
1948///
1949/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu16)
1950#[inline]
1951#[target_feature(enable = "avx2")]
1952#[cfg_attr(test, assert_instr(vpmaxuw))]
1953#[stable(feature = "simd_x86", since = "1.27.0")]
1954pub fn _mm256_max_epu16(a: __m256i, b: __m256i) -> __m256i {
1955    unsafe {
1956        let a = a.as_u16x16();
1957        let b = b.as_u16x16();
1958        transmute(simd_select::<i16x16, _>(simd_gt(a, b), a, b))
1959    }
1960}
1961
1962/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
1963/// the packed maximum values.
1964///
1965/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu32)
1966#[inline]
1967#[target_feature(enable = "avx2")]
1968#[cfg_attr(test, assert_instr(vpmaxud))]
1969#[stable(feature = "simd_x86", since = "1.27.0")]
1970pub fn _mm256_max_epu32(a: __m256i, b: __m256i) -> __m256i {
1971    unsafe {
1972        let a = a.as_u32x8();
1973        let b = b.as_u32x8();
1974        transmute(simd_select::<i32x8, _>(simd_gt(a, b), a, b))
1975    }
1976}
1977
1978/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
1979/// the packed maximum values.
1980///
1981/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_max_epu8)
1982#[inline]
1983#[target_feature(enable = "avx2")]
1984#[cfg_attr(test, assert_instr(vpmaxub))]
1985#[stable(feature = "simd_x86", since = "1.27.0")]
1986pub fn _mm256_max_epu8(a: __m256i, b: __m256i) -> __m256i {
1987    unsafe {
1988        let a = a.as_u8x32();
1989        let b = b.as_u8x32();
1990        transmute(simd_select::<i8x32, _>(simd_gt(a, b), a, b))
1991    }
1992}
1993
1994/// Compares packed 16-bit integers in `a` and `b`, and returns the packed
1995/// minimum values.
1996///
1997/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi16)
1998#[inline]
1999#[target_feature(enable = "avx2")]
2000#[cfg_attr(test, assert_instr(vpminsw))]
2001#[stable(feature = "simd_x86", since = "1.27.0")]
2002pub fn _mm256_min_epi16(a: __m256i, b: __m256i) -> __m256i {
2003    unsafe {
2004        let a = a.as_i16x16();
2005        let b = b.as_i16x16();
2006        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
2007    }
2008}
2009
2010/// Compares packed 32-bit integers in `a` and `b`, and returns the packed
2011/// minimum values.
2012///
2013/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi32)
2014#[inline]
2015#[target_feature(enable = "avx2")]
2016#[cfg_attr(test, assert_instr(vpminsd))]
2017#[stable(feature = "simd_x86", since = "1.27.0")]
2018pub fn _mm256_min_epi32(a: __m256i, b: __m256i) -> __m256i {
2019    unsafe {
2020        let a = a.as_i32x8();
2021        let b = b.as_i32x8();
2022        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
2023    }
2024}
2025
2026/// Compares packed 8-bit integers in `a` and `b`, and returns the packed
2027/// minimum values.
2028///
2029/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epi8)
2030#[inline]
2031#[target_feature(enable = "avx2")]
2032#[cfg_attr(test, assert_instr(vpminsb))]
2033#[stable(feature = "simd_x86", since = "1.27.0")]
2034pub fn _mm256_min_epi8(a: __m256i, b: __m256i) -> __m256i {
2035    unsafe {
2036        let a = a.as_i8x32();
2037        let b = b.as_i8x32();
2038        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
2039    }
2040}
2041
2042/// Compares packed unsigned 16-bit integers in `a` and `b`, and returns
2043/// the packed minimum values.
2044///
2045/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu16)
2046#[inline]
2047#[target_feature(enable = "avx2")]
2048#[cfg_attr(test, assert_instr(vpminuw))]
2049#[stable(feature = "simd_x86", since = "1.27.0")]
2050pub fn _mm256_min_epu16(a: __m256i, b: __m256i) -> __m256i {
2051    unsafe {
2052        let a = a.as_u16x16();
2053        let b = b.as_u16x16();
2054        transmute(simd_select::<i16x16, _>(simd_lt(a, b), a, b))
2055    }
2056}
2057
2058/// Compares packed unsigned 32-bit integers in `a` and `b`, and returns
2059/// the packed minimum values.
2060///
2061/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu32)
2062#[inline]
2063#[target_feature(enable = "avx2")]
2064#[cfg_attr(test, assert_instr(vpminud))]
2065#[stable(feature = "simd_x86", since = "1.27.0")]
2066pub fn _mm256_min_epu32(a: __m256i, b: __m256i) -> __m256i {
2067    unsafe {
2068        let a = a.as_u32x8();
2069        let b = b.as_u32x8();
2070        transmute(simd_select::<i32x8, _>(simd_lt(a, b), a, b))
2071    }
2072}
2073
2074/// Compares packed unsigned 8-bit integers in `a` and `b`, and returns
2075/// the packed minimum values.
2076///
2077/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_min_epu8)
2078#[inline]
2079#[target_feature(enable = "avx2")]
2080#[cfg_attr(test, assert_instr(vpminub))]
2081#[stable(feature = "simd_x86", since = "1.27.0")]
2082pub fn _mm256_min_epu8(a: __m256i, b: __m256i) -> __m256i {
2083    unsafe {
2084        let a = a.as_u8x32();
2085        let b = b.as_u8x32();
2086        transmute(simd_select::<i8x32, _>(simd_lt(a, b), a, b))
2087    }
2088}
2089
/// Creates a mask from the most significant bit of each 8-bit element in `a`,
/// and returns the result.
2092///
2093/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_movemask_epi8)
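///
/// A scalar sketch of the semantics (illustrative only, not the actual
/// implementation; the helper name is hypothetical):
///
/// ```
/// fn movemask_epi8(a: [i8; 32]) -> i32 {
///     let mut r = 0u32;
///     for i in 0..32 {
///         // Bit `i` of the result is the sign bit of byte `i`.
///         if a[i] < 0 {
///             r |= 1u32 << i;
///         }
///     }
///     r as i32
/// }
/// ```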
2094#[inline]
2095#[target_feature(enable = "avx2")]
2096#[cfg_attr(test, assert_instr(vpmovmskb))]
2097#[stable(feature = "simd_x86", since = "1.27.0")]
2098pub fn _mm256_movemask_epi8(a: __m256i) -> i32 {
2099    unsafe {
2100        let z = i8x32::ZERO;
2101        let m: i8x32 = simd_lt(a.as_i8x32(), z);
2102        simd_bitmask::<_, u32>(m) as i32
2103    }
2104}
2105
2106/// Computes the sum of absolute differences (SADs) of quadruplets of unsigned
2107/// 8-bit integers in `a` compared to those in `b`, and stores the 16-bit
2108/// results in dst. Eight SADs are performed for each 128-bit lane using one
2109/// quadruplet from `b` and eight quadruplets from `a`. One quadruplet is
/// selected from `b` starting at the offset specified in `imm8`. Eight
2111/// quadruplets are formed from sequential 8-bit integers selected from `a`
2112/// starting at the offset specified in `imm8`.
2113///
2114/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mpsadbw_epu8)
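///
/// A scalar sketch of the semantics described above (illustrative only, not
/// the actual implementation; the helper name is hypothetical):
///
/// ```
/// fn mpsadbw_epu8(a: [u8; 32], b: [u8; 32], imm8: u8) -> [u16; 16] {
///     let mut r = [0u16; 16];
///     for lane in 0..2 {
///         // Bits [2:0] of `imm8` control the low 128-bit lane, bits [5:3]
///         // the high lane.
///         let ctrl = (imm8 >> (3 * lane)) & 0b111;
///         let a_off = lane * 16 + ((ctrl >> 2) & 1) as usize * 4;
///         let b_off = lane * 16 + (ctrl & 0b11) as usize * 4;
///         for j in 0..8 {
///             // Each SAD compares a sliding 4-byte window of `a` with the
///             // selected quadruplet of `b`.
///             let mut sad = 0u16;
///             for k in 0..4 {
///                 sad += (a[a_off + j + k] as i16 - b[b_off + k] as i16).unsigned_abs();
///             }
///             r[lane * 8 + j] = sad;
///         }
///     }
///     r
/// }
/// ```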
2115#[inline]
2116#[target_feature(enable = "avx2")]
2117#[cfg_attr(test, assert_instr(vmpsadbw, IMM8 = 0))]
2118#[rustc_legacy_const_generics(2)]
2119#[stable(feature = "simd_x86", since = "1.27.0")]
2120pub fn _mm256_mpsadbw_epu8<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2121    static_assert_uimm_bits!(IMM8, 8);
2122    unsafe { transmute(mpsadbw(a.as_u8x32(), b.as_u8x32(), IMM8 as i8)) }
2123}
2124
/// Multiplies the low signed 32-bit integers from each packed 64-bit element
/// in `a` and `b`.
2127///
2128/// Returns the 64-bit results.
2129///
2130/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epi32)
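///
/// A scalar sketch of the semantics (illustrative only, not the actual
/// implementation; the helper name is hypothetical):
///
/// ```
/// fn mul_epi32(a: [i64; 4], b: [i64; 4]) -> [i64; 4] {
///     let mut r = [0i64; 4];
///     for i in 0..4 {
///         // Only the low 32 bits of each 64-bit element take part, and they
///         // are sign-extended before the 64-bit multiplication.
///         r[i] = (a[i] as i32 as i64) * (b[i] as i32 as i64);
///     }
///     r
/// }
/// ```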
2131#[inline]
2132#[target_feature(enable = "avx2")]
2133#[cfg_attr(test, assert_instr(vpmuldq))]
2134#[stable(feature = "simd_x86", since = "1.27.0")]
2135pub fn _mm256_mul_epi32(a: __m256i, b: __m256i) -> __m256i {
2136    unsafe {
2137        let a = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(a.as_i64x4()));
2138        let b = simd_cast::<_, i64x4>(simd_cast::<_, i32x4>(b.as_i64x4()));
2139        transmute(simd_mul(a, b))
2140    }
2141}
2142
2143/// Multiplies the low unsigned 32-bit integers from each packed 64-bit
/// element in `a` and `b`.
2145///
2146/// Returns the unsigned 64-bit results.
2147///
2148/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mul_epu32)
2149#[inline]
2150#[target_feature(enable = "avx2")]
2151#[cfg_attr(test, assert_instr(vpmuludq))]
2152#[stable(feature = "simd_x86", since = "1.27.0")]
2153pub fn _mm256_mul_epu32(a: __m256i, b: __m256i) -> __m256i {
2154    unsafe {
2155        let a = a.as_u64x4();
2156        let b = b.as_u64x4();
2157        let mask = u64x4::splat(u32::MAX.into());
2158        transmute(simd_mul(simd_and(a, mask), simd_and(b, mask)))
2159    }
2160}
2161
2162/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2163/// intermediate 32-bit integers and returning the high 16 bits of the
2164/// intermediate integers.
2165///
2166/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epi16)
2167#[inline]
2168#[target_feature(enable = "avx2")]
2169#[cfg_attr(test, assert_instr(vpmulhw))]
2170#[stable(feature = "simd_x86", since = "1.27.0")]
2171pub fn _mm256_mulhi_epi16(a: __m256i, b: __m256i) -> __m256i {
2172    unsafe {
2173        let a = simd_cast::<_, i32x16>(a.as_i16x16());
2174        let b = simd_cast::<_, i32x16>(b.as_i16x16());
2175        let r = simd_shr(simd_mul(a, b), i32x16::splat(16));
2176        transmute(simd_cast::<i32x16, i16x16>(r))
2177    }
2178}
2179
2180/// Multiplies the packed unsigned 16-bit integers in `a` and `b`, producing
2181/// intermediate 32-bit integers and returning the high 16 bits of the
2182/// intermediate integers.
2183///
2184/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhi_epu16)
2185#[inline]
2186#[target_feature(enable = "avx2")]
2187#[cfg_attr(test, assert_instr(vpmulhuw))]
2188#[stable(feature = "simd_x86", since = "1.27.0")]
2189pub fn _mm256_mulhi_epu16(a: __m256i, b: __m256i) -> __m256i {
2190    unsafe {
2191        let a = simd_cast::<_, u32x16>(a.as_u16x16());
2192        let b = simd_cast::<_, u32x16>(b.as_u16x16());
2193        let r = simd_shr(simd_mul(a, b), u32x16::splat(16));
2194        transmute(simd_cast::<u32x16, u16x16>(r))
2195    }
2196}
2197
2198/// Multiplies the packed 16-bit integers in `a` and `b`, producing
2199/// intermediate 32-bit integers, and returns the low 16 bits of the
2200/// intermediate integers
2201///
2202/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi16)
2203#[inline]
2204#[target_feature(enable = "avx2")]
2205#[cfg_attr(test, assert_instr(vpmullw))]
2206#[stable(feature = "simd_x86", since = "1.27.0")]
2207pub fn _mm256_mullo_epi16(a: __m256i, b: __m256i) -> __m256i {
2208    unsafe { transmute(simd_mul(a.as_i16x16(), b.as_i16x16())) }
2209}
2210
2211/// Multiplies the packed 32-bit integers in `a` and `b`, producing
2212/// intermediate 64-bit integers, and returns the low 32 bits of the
2213/// intermediate integers
2214///
2215/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mullo_epi32)
2216#[inline]
2217#[target_feature(enable = "avx2")]
2218#[cfg_attr(test, assert_instr(vpmulld))]
2219#[stable(feature = "simd_x86", since = "1.27.0")]
2220pub fn _mm256_mullo_epi32(a: __m256i, b: __m256i) -> __m256i {
2221    unsafe { transmute(simd_mul(a.as_i32x8(), b.as_i32x8())) }
2222}
2223
2224/// Multiplies packed 16-bit integers in `a` and `b`, producing
/// intermediate signed 32-bit integers. Truncates each intermediate
/// integer to the 18 most significant bits, rounds by adding 1, and
/// returns bits `[16:1]`.
2228///
2229/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_mulhrs_epi16)
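///
/// A scalar sketch of the semantics (illustrative only, not the actual
/// implementation; the helper name is hypothetical):
///
/// ```
/// fn mulhrs_epi16(a: [i16; 16], b: [i16; 16]) -> [i16; 16] {
///     let mut r = [0i16; 16];
///     for i in 0..16 {
///         // 32-bit product, keep the top 18 bits, round by adding 1,
///         // then take bits [16:1].
///         let tmp = ((a[i] as i32 * b[i] as i32) >> 14) + 1;
///         r[i] = (tmp >> 1) as i16;
///     }
///     r
/// }
/// ```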
2230#[inline]
2231#[target_feature(enable = "avx2")]
2232#[cfg_attr(test, assert_instr(vpmulhrsw))]
2233#[stable(feature = "simd_x86", since = "1.27.0")]
2234pub fn _mm256_mulhrs_epi16(a: __m256i, b: __m256i) -> __m256i {
2235    unsafe { transmute(pmulhrsw(a.as_i16x16(), b.as_i16x16())) }
2236}
2237
2238/// Computes the bitwise OR of 256 bits (representing integer data) in `a`
2239/// and `b`
2240///
2241/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_or_si256)
2242#[inline]
2243#[target_feature(enable = "avx2")]
2244#[cfg_attr(test, assert_instr(vorps))]
2245#[stable(feature = "simd_x86", since = "1.27.0")]
2246pub fn _mm256_or_si256(a: __m256i, b: __m256i) -> __m256i {
2247    unsafe { transmute(simd_or(a.as_i32x8(), b.as_i32x8())) }
2248}
2249
2250/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2251/// using signed saturation
2252///
2253/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi16)
2254#[inline]
2255#[target_feature(enable = "avx2")]
2256#[cfg_attr(test, assert_instr(vpacksswb))]
2257#[stable(feature = "simd_x86", since = "1.27.0")]
2258pub fn _mm256_packs_epi16(a: __m256i, b: __m256i) -> __m256i {
2259    unsafe { transmute(packsswb(a.as_i16x16(), b.as_i16x16())) }
2260}
2261
2262/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2263/// using signed saturation
2264///
2265/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packs_epi32)
2266#[inline]
2267#[target_feature(enable = "avx2")]
2268#[cfg_attr(test, assert_instr(vpackssdw))]
2269#[stable(feature = "simd_x86", since = "1.27.0")]
2270pub fn _mm256_packs_epi32(a: __m256i, b: __m256i) -> __m256i {
2271    unsafe { transmute(packssdw(a.as_i32x8(), b.as_i32x8())) }
2272}
2273
2274/// Converts packed 16-bit integers from `a` and `b` to packed 8-bit integers
2275/// using unsigned saturation
2276///
2277/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi16)
2278#[inline]
2279#[target_feature(enable = "avx2")]
2280#[cfg_attr(test, assert_instr(vpackuswb))]
2281#[stable(feature = "simd_x86", since = "1.27.0")]
2282pub fn _mm256_packus_epi16(a: __m256i, b: __m256i) -> __m256i {
2283    unsafe { transmute(packuswb(a.as_i16x16(), b.as_i16x16())) }
2284}
2285
2286/// Converts packed 32-bit integers from `a` and `b` to packed 16-bit integers
2287/// using unsigned saturation
2288///
2289/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_packus_epi32)
2290#[inline]
2291#[target_feature(enable = "avx2")]
2292#[cfg_attr(test, assert_instr(vpackusdw))]
2293#[stable(feature = "simd_x86", since = "1.27.0")]
2294pub fn _mm256_packus_epi32(a: __m256i, b: __m256i) -> __m256i {
2295    unsafe { transmute(packusdw(a.as_i32x8(), b.as_i32x8())) }
2296}
2297
2298/// Permutes packed 32-bit integers from `a` according to the content of `b`.
2299///
/// The low 3 bits of each integer of `b` are used as indices into the 8
/// integers of `a`.
2302///
2303/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_epi32)
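///
/// A scalar sketch of the semantics (illustrative only, not the actual
/// implementation; the helper name is hypothetical):
///
/// ```
/// fn permutevar8x32_epi32(a: [i32; 8], b: [i32; 8]) -> [i32; 8] {
///     let mut r = [0i32; 8];
///     for i in 0..8 {
///         // Only the low 3 bits of each index element are used.
///         r[i] = a[(b[i] & 0b111) as usize];
///     }
///     r
/// }
/// ```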
2304#[inline]
2305#[target_feature(enable = "avx2")]
2306#[cfg_attr(test, assert_instr(vpermps))]
2307#[stable(feature = "simd_x86", since = "1.27.0")]
2308pub fn _mm256_permutevar8x32_epi32(a: __m256i, b: __m256i) -> __m256i {
2309    unsafe { transmute(permd(a.as_u32x8(), b.as_u32x8())) }
2310}
2311
2312/// Permutes 64-bit integers from `a` using control mask `imm8`.
2313///
2314/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_epi64)
2315#[inline]
2316#[target_feature(enable = "avx2")]
2317#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 9))]
2318#[rustc_legacy_const_generics(1)]
2319#[stable(feature = "simd_x86", since = "1.27.0")]
2320pub fn _mm256_permute4x64_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2321    static_assert_uimm_bits!(IMM8, 8);
2322    unsafe {
2323        let zero = i64x4::ZERO;
2324        let r: i64x4 = simd_shuffle!(
2325            a.as_i64x4(),
2326            zero,
2327            [
2328                IMM8 as u32 & 0b11,
2329                (IMM8 as u32 >> 2) & 0b11,
2330                (IMM8 as u32 >> 4) & 0b11,
2331                (IMM8 as u32 >> 6) & 0b11,
2332            ],
2333        );
2334        transmute(r)
2335    }
2336}
2337
/// Shuffles 128 bits of integer data selected by `imm8` from `a` and `b`.
2339///
2340/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute2x128_si256)
2341#[inline]
2342#[target_feature(enable = "avx2")]
2343#[cfg_attr(test, assert_instr(vperm2f128, IMM8 = 9))]
2344#[rustc_legacy_const_generics(2)]
2345#[stable(feature = "simd_x86", since = "1.27.0")]
2346pub fn _mm256_permute2x128_si256<const IMM8: i32>(a: __m256i, b: __m256i) -> __m256i {
2347    static_assert_uimm_bits!(IMM8, 8);
2348    _mm256_permute2f128_si256::<IMM8>(a, b)
2349}
2350
2351/// Shuffles 64-bit floating-point elements in `a` across lanes using the
2352/// control in `imm8`.
2353///
2354/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permute4x64_pd)
2355#[inline]
2356#[target_feature(enable = "avx2")]
2357#[cfg_attr(test, assert_instr(vpermpd, IMM8 = 1))]
2358#[rustc_legacy_const_generics(1)]
2359#[stable(feature = "simd_x86", since = "1.27.0")]
2360pub fn _mm256_permute4x64_pd<const IMM8: i32>(a: __m256d) -> __m256d {
2361    static_assert_uimm_bits!(IMM8, 8);
2362    unsafe {
2363        simd_shuffle!(
2364            a,
2365            _mm256_undefined_pd(),
2366            [
2367                IMM8 as u32 & 0b11,
2368                (IMM8 as u32 >> 2) & 0b11,
2369                (IMM8 as u32 >> 4) & 0b11,
2370                (IMM8 as u32 >> 6) & 0b11,
2371            ],
2372        )
2373    }
2374}
2375
2376/// Shuffles eight 32-bit floating-point elements in `a` across lanes using
2377/// the corresponding 32-bit integer index in `idx`.
2378///
2379/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_permutevar8x32_ps)
2380#[inline]
2381#[target_feature(enable = "avx2")]
2382#[cfg_attr(test, assert_instr(vpermps))]
2383#[stable(feature = "simd_x86", since = "1.27.0")]
2384pub fn _mm256_permutevar8x32_ps(a: __m256, idx: __m256i) -> __m256 {
2385    unsafe { permps(a, idx.as_i32x8()) }
2386}
2387
2388/// Computes the absolute differences of packed unsigned 8-bit integers in `a`
/// and `b`, then horizontally sums each set of 8 consecutive differences to
/// produce four unsigned 16-bit integers, and packs these unsigned 16-bit
/// integers in the low 16 bits of each 64-bit element of the result.
2392///
2393/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sad_epu8)
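///
/// A scalar sketch of the semantics (illustrative only, not the actual
/// implementation; the helper name is hypothetical):
///
/// ```
/// fn sad_epu8(a: [u8; 32], b: [u8; 32]) -> [u64; 4] {
///     let mut r = [0u64; 4];
///     for i in 0..4 {
///         // Each 64-bit result element holds the sum of 8 absolute differences.
///         for j in 0..8 {
///             r[i] += a[8 * i + j].abs_diff(b[8 * i + j]) as u64;
///         }
///     }
///     r
/// }
/// ```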
2394#[inline]
2395#[target_feature(enable = "avx2")]
2396#[cfg_attr(test, assert_instr(vpsadbw))]
2397#[stable(feature = "simd_x86", since = "1.27.0")]
2398pub fn _mm256_sad_epu8(a: __m256i, b: __m256i) -> __m256i {
2399    unsafe { transmute(psadbw(a.as_u8x32(), b.as_u8x32())) }
2400}
2401
2402/// Shuffles bytes from `a` according to the content of `b`.
2403///
/// For each of the 128-bit low and high halves of the vectors, the low
/// 4 bits of each byte of `b` are used as indices into the respective
/// low or high 16 bytes of `a`. That is, the halves are shuffled separately.
///
/// In addition, if the most significant bit of a byte of `b` is set, the
2409/// respective destination byte is set to 0.
2410///
2411/// Picturing `a` and `b` as `[u8; 32]`, `_mm256_shuffle_epi8` is logically
2412/// equivalent to:
2413///
2414/// ```
2415/// fn mm256_shuffle_epi8(a: [u8; 32], b: [u8; 32]) -> [u8; 32] {
2416///     let mut r = [0; 32];
2417///     for i in 0..16 {
2418///         // if the most significant bit of b is set,
2419///         // then the destination byte is set to 0.
2420///         if b[i] & 0x80 == 0u8 {
2421///             r[i] = a[(b[i] % 16) as usize];
2422///         }
2423///         if b[i + 16] & 0x80 == 0u8 {
2424///             r[i + 16] = a[(b[i + 16] % 16 + 16) as usize];
2425///         }
2426///     }
2427///     r
2428/// }
2429/// ```
2430///
2431/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi8)
2432#[inline]
2433#[target_feature(enable = "avx2")]
2434#[cfg_attr(test, assert_instr(vpshufb))]
2435#[stable(feature = "simd_x86", since = "1.27.0")]
2436pub fn _mm256_shuffle_epi8(a: __m256i, b: __m256i) -> __m256i {
2437    unsafe { transmute(pshufb(a.as_u8x32(), b.as_u8x32())) }
2438}
2439
2440/// Shuffles 32-bit integers in 128-bit lanes of `a` using the control in
2441/// `imm8`.
2442///
2443/// ```rust
2444/// #[cfg(target_arch = "x86")]
2445/// use std::arch::x86::*;
2446/// #[cfg(target_arch = "x86_64")]
2447/// use std::arch::x86_64::*;
2448///
2449/// # fn main() {
2450/// #     if is_x86_feature_detected!("avx2") {
2451/// #         #[target_feature(enable = "avx2")]
2452/// #         unsafe fn worker() {
2453/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
2454///
2455/// let c1 = _mm256_shuffle_epi32(a, 0b00_11_10_01);
2456/// let c2 = _mm256_shuffle_epi32(a, 0b01_00_10_11);
2457///
2458/// let expected1 = _mm256_setr_epi32(1, 2, 3, 0, 5, 6, 7, 4);
2459/// let expected2 = _mm256_setr_epi32(3, 2, 0, 1, 7, 6, 4, 5);
2460///
2461/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c1, expected1)), !0);
2462/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c2, expected2)), !0);
2463/// #         }
2464/// #         unsafe { worker(); }
2465/// #     }
2466/// # }
2467/// ```
2468///
2469/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shuffle_epi32)
2470#[inline]
2471#[target_feature(enable = "avx2")]
2472#[cfg_attr(test, assert_instr(vshufps, MASK = 9))]
2473#[rustc_legacy_const_generics(1)]
2474#[stable(feature = "simd_x86", since = "1.27.0")]
2475pub fn _mm256_shuffle_epi32<const MASK: i32>(a: __m256i) -> __m256i {
2476    static_assert_uimm_bits!(MASK, 8);
2477    unsafe {
2478        let r: i32x8 = simd_shuffle!(
2479            a.as_i32x8(),
2480            a.as_i32x8(),
2481            [
2482                MASK as u32 & 0b11,
2483                (MASK as u32 >> 2) & 0b11,
2484                (MASK as u32 >> 4) & 0b11,
2485                (MASK as u32 >> 6) & 0b11,
2486                (MASK as u32 & 0b11) + 4,
2487                ((MASK as u32 >> 2) & 0b11) + 4,
2488                ((MASK as u32 >> 4) & 0b11) + 4,
2489                ((MASK as u32 >> 6) & 0b11) + 4,
2490            ],
2491        );
2492        transmute(r)
2493    }
2494}
2495
2496/// Shuffles 16-bit integers in the high 64 bits of 128-bit lanes of `a` using
2497/// the control in `imm8`. The low 64 bits of 128-bit lanes of `a` are copied
2498/// to the output.
2499///
2500/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflehi_epi16)
2501#[inline]
2502#[target_feature(enable = "avx2")]
2503#[cfg_attr(test, assert_instr(vpshufhw, IMM8 = 9))]
2504#[rustc_legacy_const_generics(1)]
2505#[stable(feature = "simd_x86", since = "1.27.0")]
2506pub fn _mm256_shufflehi_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2507    static_assert_uimm_bits!(IMM8, 8);
2508    unsafe {
2509        let a = a.as_i16x16();
2510        let r: i16x16 = simd_shuffle!(
2511            a,
2512            a,
2513            [
2514                0,
2515                1,
2516                2,
2517                3,
2518                4 + (IMM8 as u32 & 0b11),
2519                4 + ((IMM8 as u32 >> 2) & 0b11),
2520                4 + ((IMM8 as u32 >> 4) & 0b11),
2521                4 + ((IMM8 as u32 >> 6) & 0b11),
2522                8,
2523                9,
2524                10,
2525                11,
2526                12 + (IMM8 as u32 & 0b11),
2527                12 + ((IMM8 as u32 >> 2) & 0b11),
2528                12 + ((IMM8 as u32 >> 4) & 0b11),
2529                12 + ((IMM8 as u32 >> 6) & 0b11),
2530            ],
2531        );
2532        transmute(r)
2533    }
2534}
2535
2536/// Shuffles 16-bit integers in the low 64 bits of 128-bit lanes of `a` using
2537/// the control in `imm8`. The high 64 bits of 128-bit lanes of `a` are copied
2538/// to the output.
2539///
2540/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_shufflelo_epi16)
2541#[inline]
2542#[target_feature(enable = "avx2")]
2543#[cfg_attr(test, assert_instr(vpshuflw, IMM8 = 9))]
2544#[rustc_legacy_const_generics(1)]
2545#[stable(feature = "simd_x86", since = "1.27.0")]
2546pub fn _mm256_shufflelo_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2547    static_assert_uimm_bits!(IMM8, 8);
2548    unsafe {
2549        let a = a.as_i16x16();
2550        let r: i16x16 = simd_shuffle!(
2551            a,
2552            a,
2553            [
2554                0 + (IMM8 as u32 & 0b11),
2555                0 + ((IMM8 as u32 >> 2) & 0b11),
2556                0 + ((IMM8 as u32 >> 4) & 0b11),
2557                0 + ((IMM8 as u32 >> 6) & 0b11),
2558                4,
2559                5,
2560                6,
2561                7,
2562                8 + (IMM8 as u32 & 0b11),
2563                8 + ((IMM8 as u32 >> 2) & 0b11),
2564                8 + ((IMM8 as u32 >> 4) & 0b11),
2565                8 + ((IMM8 as u32 >> 6) & 0b11),
2566                12,
2567                13,
2568                14,
2569                15,
2570            ],
2571        );
2572        transmute(r)
2573    }
2574}
2575
2576/// Negates packed 16-bit integers in `a` when the corresponding signed
2577/// 16-bit integer in `b` is negative, and returns the results.
2578/// Results are zeroed out when the corresponding element in `b` is zero.
2579///
2580/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi16)
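///
/// A scalar sketch of the semantics (illustrative only, not the actual
/// implementation; the helper name is hypothetical):
///
/// ```
/// fn sign_epi16(a: [i16; 16], b: [i16; 16]) -> [i16; 16] {
///     let mut r = [0i16; 16];
///     for i in 0..16 {
///         r[i] = if b[i] < 0 {
///             // `i16::MIN` stays `i16::MIN`, as in the hardware instruction.
///             a[i].wrapping_neg()
///         } else if b[i] == 0 {
///             0
///         } else {
///             a[i]
///         };
///     }
///     r
/// }
/// ```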
2581#[inline]
2582#[target_feature(enable = "avx2")]
2583#[cfg_attr(test, assert_instr(vpsignw))]
2584#[stable(feature = "simd_x86", since = "1.27.0")]
2585pub fn _mm256_sign_epi16(a: __m256i, b: __m256i) -> __m256i {
2586    unsafe { transmute(psignw(a.as_i16x16(), b.as_i16x16())) }
2587}
2588
2589/// Negates packed 32-bit integers in `a` when the corresponding signed
2590/// 32-bit integer in `b` is negative, and returns the results.
2591/// Results are zeroed out when the corresponding element in `b` is zero.
2592///
2593/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi32)
2594#[inline]
2595#[target_feature(enable = "avx2")]
2596#[cfg_attr(test, assert_instr(vpsignd))]
2597#[stable(feature = "simd_x86", since = "1.27.0")]
2598pub fn _mm256_sign_epi32(a: __m256i, b: __m256i) -> __m256i {
2599    unsafe { transmute(psignd(a.as_i32x8(), b.as_i32x8())) }
2600}
2601
2602/// Negates packed 8-bit integers in `a` when the corresponding signed
2603/// 8-bit integer in `b` is negative, and returns the results.
2604/// Results are zeroed out when the corresponding element in `b` is zero.
2605///
2606/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sign_epi8)
2607#[inline]
2608#[target_feature(enable = "avx2")]
2609#[cfg_attr(test, assert_instr(vpsignb))]
2610#[stable(feature = "simd_x86", since = "1.27.0")]
2611pub fn _mm256_sign_epi8(a: __m256i, b: __m256i) -> __m256i {
2612    unsafe { transmute(psignb(a.as_i8x32(), b.as_i8x32())) }
2613}
2614
2615/// Shifts packed 16-bit integers in `a` left by `count` while
2616/// shifting in zeros, and returns the result
2617///
2618/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi16)
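///
/// A scalar sketch of the semantics (illustrative only, not the actual
/// implementation; the helper name is hypothetical, and `count` stands for
/// the unsigned value of the low 64 bits of the `count` vector):
///
/// ```
/// fn sll_epi16(a: [i16; 16], count: u64) -> [i16; 16] {
///     let mut r = [0i16; 16];
///     // Shift counts of 16 or more clear every element.
///     if count < 16 {
///         for i in 0..16 {
///             r[i] = ((a[i] as u16) << count) as i16;
///         }
///     }
///     r
/// }
/// ```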
2619#[inline]
2620#[target_feature(enable = "avx2")]
2621#[cfg_attr(test, assert_instr(vpsllw))]
2622#[stable(feature = "simd_x86", since = "1.27.0")]
2623pub fn _mm256_sll_epi16(a: __m256i, count: __m128i) -> __m256i {
2624    unsafe { transmute(psllw(a.as_i16x16(), count.as_i16x8())) }
2625}
2626
2627/// Shifts packed 32-bit integers in `a` left by `count` while
2628/// shifting in zeros, and returns the result
2629///
2630/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi32)
2631#[inline]
2632#[target_feature(enable = "avx2")]
2633#[cfg_attr(test, assert_instr(vpslld))]
2634#[stable(feature = "simd_x86", since = "1.27.0")]
2635pub fn _mm256_sll_epi32(a: __m256i, count: __m128i) -> __m256i {
2636    unsafe { transmute(pslld(a.as_i32x8(), count.as_i32x4())) }
2637}
2638
2639/// Shifts packed 64-bit integers in `a` left by `count` while
2640/// shifting in zeros, and returns the result
2641///
2642/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sll_epi64)
2643#[inline]
2644#[target_feature(enable = "avx2")]
2645#[cfg_attr(test, assert_instr(vpsllq))]
2646#[stable(feature = "simd_x86", since = "1.27.0")]
2647pub fn _mm256_sll_epi64(a: __m256i, count: __m128i) -> __m256i {
2648    unsafe { transmute(psllq(a.as_i64x4(), count.as_i64x2())) }
2649}
2650
2651/// Shifts packed 16-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2653///
2654/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi16)
2655#[inline]
2656#[target_feature(enable = "avx2")]
2657#[cfg_attr(test, assert_instr(vpsllw, IMM8 = 7))]
2658#[rustc_legacy_const_generics(1)]
2659#[stable(feature = "simd_x86", since = "1.27.0")]
2660pub fn _mm256_slli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2661    static_assert_uimm_bits!(IMM8, 8);
2662    unsafe {
2663        if IMM8 >= 16 {
2664            _mm256_setzero_si256()
2665        } else {
2666            transmute(simd_shl(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
2667        }
2668    }
2669}
2670
2671/// Shifts packed 32-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2673///
2674/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi32)
2675#[inline]
2676#[target_feature(enable = "avx2")]
2677#[cfg_attr(test, assert_instr(vpslld, IMM8 = 7))]
2678#[rustc_legacy_const_generics(1)]
2679#[stable(feature = "simd_x86", since = "1.27.0")]
2680pub fn _mm256_slli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
    static_assert_uimm_bits!(IMM8, 8);
    unsafe {
        if IMM8 >= 32 {
2684            _mm256_setzero_si256()
2685        } else {
2686            transmute(simd_shl(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
2687        }
2688    }
2689}
2690
2691/// Shifts packed 64-bit integers in `a` left by `IMM8` while
/// shifting in zeros, and returns the results.
2693///
2694/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_epi64)
2695#[inline]
2696#[target_feature(enable = "avx2")]
2697#[cfg_attr(test, assert_instr(vpsllq, IMM8 = 7))]
2698#[rustc_legacy_const_generics(1)]
2699#[stable(feature = "simd_x86", since = "1.27.0")]
2700pub fn _mm256_slli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
2701    static_assert_uimm_bits!(IMM8, 8);
2702    unsafe {
2703        if IMM8 >= 64 {
2704            _mm256_setzero_si256()
2705        } else {
2706            transmute(simd_shl(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
2707        }
2708    }
2709}
2710
2711/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2712///
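/// An illustrative sketch (it assumes AVX2 is detected at runtime): each
/// 128-bit lane is shifted independently, so zero bytes appear at the start of
/// both lanes.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
///
/// let r = _mm256_slli_si256::<4>(a);
///
/// let expected = _mm256_setr_epi8(
///     0, 0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
///     0, 0, 0, 0, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///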
2713/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_slli_si256)
2714#[inline]
2715#[target_feature(enable = "avx2")]
2716#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2717#[rustc_legacy_const_generics(1)]
2718#[stable(feature = "simd_x86", since = "1.27.0")]
2719pub fn _mm256_slli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2720    static_assert_uimm_bits!(IMM8, 8);
2721    _mm256_bslli_epi128::<IMM8>(a)
2722}
2723
2724/// Shifts 128-bit lanes in `a` left by `imm8` bytes while shifting in zeros.
2725///
2726/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bslli_epi128)
2727#[inline]
2728#[target_feature(enable = "avx2")]
2729#[cfg_attr(test, assert_instr(vpslldq, IMM8 = 3))]
2730#[rustc_legacy_const_generics(1)]
2731#[stable(feature = "simd_x86", since = "1.27.0")]
2732pub fn _mm256_bslli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2733    static_assert_uimm_bits!(IMM8, 8);
2734    const fn mask(shift: i32, i: u32) -> u32 {
2735        let shift = shift as u32 & 0xff;
2736        if shift > 15 || i % 16 < shift {
2737            0
2738        } else {
2739            32 + (i - shift)
2740        }
2741    }
2742    unsafe {
2743        let a = a.as_i8x32();
2744        let r: i8x32 = simd_shuffle!(
2745            i8x32::ZERO,
2746            a,
2747            [
2748                mask(IMM8, 0),
2749                mask(IMM8, 1),
2750                mask(IMM8, 2),
2751                mask(IMM8, 3),
2752                mask(IMM8, 4),
2753                mask(IMM8, 5),
2754                mask(IMM8, 6),
2755                mask(IMM8, 7),
2756                mask(IMM8, 8),
2757                mask(IMM8, 9),
2758                mask(IMM8, 10),
2759                mask(IMM8, 11),
2760                mask(IMM8, 12),
2761                mask(IMM8, 13),
2762                mask(IMM8, 14),
2763                mask(IMM8, 15),
2764                mask(IMM8, 16),
2765                mask(IMM8, 17),
2766                mask(IMM8, 18),
2767                mask(IMM8, 19),
2768                mask(IMM8, 20),
2769                mask(IMM8, 21),
2770                mask(IMM8, 22),
2771                mask(IMM8, 23),
2772                mask(IMM8, 24),
2773                mask(IMM8, 25),
2774                mask(IMM8, 26),
2775                mask(IMM8, 27),
2776                mask(IMM8, 28),
2777                mask(IMM8, 29),
2778                mask(IMM8, 30),
2779                mask(IMM8, 31),
2780            ],
2781        );
2782        transmute(r)
2783    }
2784}
2785
2786/// Shifts packed 32-bit integers in `a` left by the amount
2787/// specified by the corresponding element in `count` while
2788/// shifting in zeros, and returns the result.
2789///
2790/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi32)
2791#[inline]
2792#[target_feature(enable = "avx2")]
2793#[cfg_attr(test, assert_instr(vpsllvd))]
2794#[stable(feature = "simd_x86", since = "1.27.0")]
2795pub fn _mm_sllv_epi32(a: __m128i, count: __m128i) -> __m128i {
2796    unsafe {
2797        let count = count.as_u32x4();
2798        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
2799        let count = simd_select(no_overflow, count, u32x4::ZERO);
2800        simd_select(no_overflow, simd_shl(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
2801    }
2802}
2803
2804/// Shifts packed 32-bit integers in `a` left by the amount
2805/// specified by the corresponding element in `count` while
2806/// shifting in zeros, and returns the result.
2807///
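/// A small sketch (arbitrary values; it assumes AVX2 is detected at runtime):
/// a per-element count of 32 or more zeroes the corresponding element.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(1);
/// let count = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 32);
/// let r = _mm256_sllv_epi32(a, count);
/// // The last element has a count of 32, so it becomes zero.
/// let expected = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///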
2808/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi32)
2809#[inline]
2810#[target_feature(enable = "avx2")]
2811#[cfg_attr(test, assert_instr(vpsllvd))]
2812#[stable(feature = "simd_x86", since = "1.27.0")]
2813pub fn _mm256_sllv_epi32(a: __m256i, count: __m256i) -> __m256i {
2814    unsafe {
2815        let count = count.as_u32x8();
2816        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
2817        let count = simd_select(no_overflow, count, u32x8::ZERO);
2818        simd_select(no_overflow, simd_shl(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
2819    }
2820}
2821
2822/// Shifts packed 64-bit integers in `a` left by the amount
2823/// specified by the corresponding element in `count` while
2824/// shifting in zeros, and returns the result.
2825///
2826/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sllv_epi64)
2827#[inline]
2828#[target_feature(enable = "avx2")]
2829#[cfg_attr(test, assert_instr(vpsllvq))]
2830#[stable(feature = "simd_x86", since = "1.27.0")]
2831pub fn _mm_sllv_epi64(a: __m128i, count: __m128i) -> __m128i {
2832    unsafe {
2833        let count = count.as_u64x2();
2834        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
2835        let count = simd_select(no_overflow, count, u64x2::ZERO);
2836        simd_select(no_overflow, simd_shl(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
2837    }
2838}
2839
2840/// Shifts packed 64-bit integers in `a` left by the amount
2841/// specified by the corresponding element in `count` while
2842/// shifting in zeros, and returns the result.
2843///
2844/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sllv_epi64)
2845#[inline]
2846#[target_feature(enable = "avx2")]
2847#[cfg_attr(test, assert_instr(vpsllvq))]
2848#[stable(feature = "simd_x86", since = "1.27.0")]
2849pub fn _mm256_sllv_epi64(a: __m256i, count: __m256i) -> __m256i {
2850    unsafe {
2851        let count = count.as_u64x4();
2852        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
2853        let count = simd_select(no_overflow, count, u64x4::ZERO);
2854        simd_select(no_overflow, simd_shl(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
2855    }
2856}
2857
2858/// Shifts packed 16-bit integers in `a` right by `count` while
2859/// shifting in sign bits.
2860///
2861/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi16)
2862#[inline]
2863#[target_feature(enable = "avx2")]
2864#[cfg_attr(test, assert_instr(vpsraw))]
2865#[stable(feature = "simd_x86", since = "1.27.0")]
2866pub fn _mm256_sra_epi16(a: __m256i, count: __m128i) -> __m256i {
2867    unsafe { transmute(psraw(a.as_i16x16(), count.as_i16x8())) }
2868}
2869
2870/// Shifts packed 32-bit integers in `a` right by `count` while
2871/// shifting in sign bits.
2872///
2873/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sra_epi32)
2874#[inline]
2875#[target_feature(enable = "avx2")]
2876#[cfg_attr(test, assert_instr(vpsrad))]
2877#[stable(feature = "simd_x86", since = "1.27.0")]
2878pub fn _mm256_sra_epi32(a: __m256i, count: __m128i) -> __m256i {
2879    unsafe { transmute(psrad(a.as_i32x8(), count.as_i32x4())) }
2880}
2881
2882/// Shifts packed 16-bit integers in `a` right by `IMM8` while
2883/// shifting in sign bits.
2884///
2885/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi16)
2886#[inline]
2887#[target_feature(enable = "avx2")]
2888#[cfg_attr(test, assert_instr(vpsraw, IMM8 = 7))]
2889#[rustc_legacy_const_generics(1)]
2890#[stable(feature = "simd_x86", since = "1.27.0")]
2891pub fn _mm256_srai_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
2892    static_assert_uimm_bits!(IMM8, 8);
2893    unsafe { transmute(simd_shr(a.as_i16x16(), i16x16::splat(IMM8.min(15) as i16))) }
2894}
2895
2896/// Shifts packed 32-bit integers in `a` right by `IMM8` while
2897/// shifting in sign bits.
2898///
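/// An illustrative sketch (arbitrary values; it assumes AVX2 is detected at
/// runtime): the arithmetic shift replicates the sign bit into the vacated
/// positions.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(-32, -16, -8, -4, 4, 8, 16, 32);
/// let r = _mm256_srai_epi32::<2>(a);
/// // Negative elements stay negative because the sign bit is shifted in.
/// let expected = _mm256_setr_epi32(-8, -4, -2, -1, 1, 2, 4, 8);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///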
2899/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srai_epi32)
2900#[inline]
2901#[target_feature(enable = "avx2")]
2902#[cfg_attr(test, assert_instr(vpsrad, IMM8 = 7))]
2903#[rustc_legacy_const_generics(1)]
2904#[stable(feature = "simd_x86", since = "1.27.0")]
2905pub fn _mm256_srai_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
2906    static_assert_uimm_bits!(IMM8, 8);
2907    unsafe { transmute(simd_shr(a.as_i32x8(), i32x8::splat(IMM8.min(31)))) }
2908}
2909
2910/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2911/// corresponding element in `count` while shifting in sign bits.
2912///
2913/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srav_epi32)
2914#[inline]
2915#[target_feature(enable = "avx2")]
2916#[cfg_attr(test, assert_instr(vpsravd))]
2917#[stable(feature = "simd_x86", since = "1.27.0")]
2918pub fn _mm_srav_epi32(a: __m128i, count: __m128i) -> __m128i {
2919    unsafe {
2920        let count = count.as_u32x4();
2921        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
2922        let count = simd_select(no_overflow, transmute(count), i32x4::splat(31));
2923        simd_shr(a.as_i32x4(), count).as_m128i()
2924    }
2925}
2926
2927/// Shifts packed 32-bit integers in `a` right by the amount specified by the
2928/// corresponding element in `count` while shifting in sign bits.
2929///
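/// A sketch of the variable-count form (arbitrary values; it assumes AVX2 is
/// detected at runtime): counts of 32 or more fill the element with its sign
/// bit.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi32(-64, -64, -64, -64, 64, 64, 64, 64);
/// let count = _mm256_setr_epi32(0, 1, 2, 33, 0, 1, 2, 33);
/// let r = _mm256_srav_epi32(a, count);
/// // A count of 33 fills the element with the sign bit: -1 or 0.
/// let expected = _mm256_setr_epi32(-64, -32, -16, -1, 64, 32, 16, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///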
2930/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srav_epi32)
2931#[inline]
2932#[target_feature(enable = "avx2")]
2933#[cfg_attr(test, assert_instr(vpsravd))]
2934#[stable(feature = "simd_x86", since = "1.27.0")]
2935pub fn _mm256_srav_epi32(a: __m256i, count: __m256i) -> __m256i {
2936    unsafe {
2937        let count = count.as_u32x8();
2938        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
2939        let count = simd_select(no_overflow, transmute(count), i32x8::splat(31));
2940        simd_shr(a.as_i32x8(), count).as_m256i()
2941    }
2942}
2943
2944/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
2945///
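/// An illustrative sketch (it assumes AVX2 is detected at runtime): bytes move
/// toward the lower end of each 128-bit lane independently, with zeros filling
/// the top of both lanes.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_setr_epi8(
///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
/// );
///
/// let r = _mm256_srli_si256::<4>(a);
///
/// let expected = _mm256_setr_epi8(
///     4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 0, 0, 0,
///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 0, 0, 0, 0,
/// );
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///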
2946/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_si256)
2947#[inline]
2948#[target_feature(enable = "avx2")]
2949#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2950#[rustc_legacy_const_generics(1)]
2951#[stable(feature = "simd_x86", since = "1.27.0")]
2952pub fn _mm256_srli_si256<const IMM8: i32>(a: __m256i) -> __m256i {
2953    static_assert_uimm_bits!(IMM8, 8);
2954    _mm256_bsrli_epi128::<IMM8>(a)
2955}
2956
2957/// Shifts 128-bit lanes in `a` right by `imm8` bytes while shifting in zeros.
2958///
2959/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_bsrli_epi128)
2960#[inline]
2961#[target_feature(enable = "avx2")]
2962#[cfg_attr(test, assert_instr(vpsrldq, IMM8 = 1))]
2963#[rustc_legacy_const_generics(1)]
2964#[stable(feature = "simd_x86", since = "1.27.0")]
2965pub fn _mm256_bsrli_epi128<const IMM8: i32>(a: __m256i) -> __m256i {
2966    static_assert_uimm_bits!(IMM8, 8);
2967    const fn mask(shift: i32, i: u32) -> u32 {
2968        let shift = shift as u32 & 0xff;
2969        if shift > 15 || (15 - (i % 16)) < shift {
2970            0
2971        } else {
2972            32 + (i + shift)
2973        }
2974    }
2975    unsafe {
2976        let a = a.as_i8x32();
2977        let r: i8x32 = simd_shuffle!(
2978            i8x32::ZERO,
2979            a,
2980            [
2981                mask(IMM8, 0),
2982                mask(IMM8, 1),
2983                mask(IMM8, 2),
2984                mask(IMM8, 3),
2985                mask(IMM8, 4),
2986                mask(IMM8, 5),
2987                mask(IMM8, 6),
2988                mask(IMM8, 7),
2989                mask(IMM8, 8),
2990                mask(IMM8, 9),
2991                mask(IMM8, 10),
2992                mask(IMM8, 11),
2993                mask(IMM8, 12),
2994                mask(IMM8, 13),
2995                mask(IMM8, 14),
2996                mask(IMM8, 15),
2997                mask(IMM8, 16),
2998                mask(IMM8, 17),
2999                mask(IMM8, 18),
3000                mask(IMM8, 19),
3001                mask(IMM8, 20),
3002                mask(IMM8, 21),
3003                mask(IMM8, 22),
3004                mask(IMM8, 23),
3005                mask(IMM8, 24),
3006                mask(IMM8, 25),
3007                mask(IMM8, 26),
3008                mask(IMM8, 27),
3009                mask(IMM8, 28),
3010                mask(IMM8, 29),
3011                mask(IMM8, 30),
3012                mask(IMM8, 31),
3013            ],
3014        );
3015        transmute(r)
3016    }
3017}
3018
3019/// Shifts packed 16-bit integers in `a` right by `count` while shifting in
3020/// zeros.
3021///
3022/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi16)
3023#[inline]
3024#[target_feature(enable = "avx2")]
3025#[cfg_attr(test, assert_instr(vpsrlw))]
3026#[stable(feature = "simd_x86", since = "1.27.0")]
3027pub fn _mm256_srl_epi16(a: __m256i, count: __m128i) -> __m256i {
3028    unsafe { transmute(psrlw(a.as_i16x16(), count.as_i16x8())) }
3029}
3030
3031/// Shifts packed 32-bit integers in `a` right by `count` while shifting in
3032/// zeros.
3033///
3034/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi32)
3035#[inline]
3036#[target_feature(enable = "avx2")]
3037#[cfg_attr(test, assert_instr(vpsrld))]
3038#[stable(feature = "simd_x86", since = "1.27.0")]
3039pub fn _mm256_srl_epi32(a: __m256i, count: __m128i) -> __m256i {
3040    unsafe { transmute(psrld(a.as_i32x8(), count.as_i32x4())) }
3041}
3042
3043/// Shifts packed 64-bit integers in `a` right by `count` while shifting in
3044/// zeros.
3045///
3046/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srl_epi64)
3047#[inline]
3048#[target_feature(enable = "avx2")]
3049#[cfg_attr(test, assert_instr(vpsrlq))]
3050#[stable(feature = "simd_x86", since = "1.27.0")]
3051pub fn _mm256_srl_epi64(a: __m256i, count: __m128i) -> __m256i {
3052    unsafe { transmute(psrlq(a.as_i64x4(), count.as_i64x2())) }
3053}
3054
3055/// Shifts packed 16-bit integers in `a` right by `IMM8` while shifting in
3056/// zeros.
3057///
3058/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi16)
3059#[inline]
3060#[target_feature(enable = "avx2")]
3061#[cfg_attr(test, assert_instr(vpsrlw, IMM8 = 7))]
3062#[rustc_legacy_const_generics(1)]
3063#[stable(feature = "simd_x86", since = "1.27.0")]
3064pub fn _mm256_srli_epi16<const IMM8: i32>(a: __m256i) -> __m256i {
3065    static_assert_uimm_bits!(IMM8, 8);
3066    unsafe {
3067        if IMM8 >= 16 {
3068            _mm256_setzero_si256()
3069        } else {
3070            transmute(simd_shr(a.as_u16x16(), u16x16::splat(IMM8 as u16)))
3071        }
3072    }
3073}
3074
3075/// Shifts packed 32-bit integers in `a` right by `IMM8` while shifting in
3076/// zeros.
3077///
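/// A brief sketch (it assumes AVX2 is detected at runtime) contrasting the
/// logical shift with `_mm256_srai_epi32`: zeros, not sign bits, are shifted
/// in.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi32(-1);
/// let r = _mm256_srli_epi32::<28>(a);
/// // 0xFFFF_FFFF >> 28 == 0xF, because zeros are shifted in.
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_set1_epi32(15))), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///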
3078/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi32)
3079#[inline]
3080#[target_feature(enable = "avx2")]
3081#[cfg_attr(test, assert_instr(vpsrld, IMM8 = 7))]
3082#[rustc_legacy_const_generics(1)]
3083#[stable(feature = "simd_x86", since = "1.27.0")]
3084pub fn _mm256_srli_epi32<const IMM8: i32>(a: __m256i) -> __m256i {
3085    static_assert_uimm_bits!(IMM8, 8);
3086    unsafe {
3087        if IMM8 >= 32 {
3088            _mm256_setzero_si256()
3089        } else {
3090            transmute(simd_shr(a.as_u32x8(), u32x8::splat(IMM8 as u32)))
3091        }
3092    }
3093}
3094
3095/// Shifts packed 64-bit integers in `a` right by `IMM8` while shifting in
3096/// zeros.
3097///
3098/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srli_epi64)
3099#[inline]
3100#[target_feature(enable = "avx2")]
3101#[cfg_attr(test, assert_instr(vpsrlq, IMM8 = 7))]
3102#[rustc_legacy_const_generics(1)]
3103#[stable(feature = "simd_x86", since = "1.27.0")]
3104pub fn _mm256_srli_epi64<const IMM8: i32>(a: __m256i) -> __m256i {
3105    static_assert_uimm_bits!(IMM8, 8);
3106    unsafe {
3107        if IMM8 >= 64 {
3108            _mm256_setzero_si256()
3109        } else {
3110            transmute(simd_shr(a.as_u64x4(), u64x4::splat(IMM8 as u64)))
3111        }
3112    }
3113}
3114
3115/// Shifts packed 32-bit integers in `a` right by the amount specified by
3116/// the corresponding element in `count` while shifting in zeros.
3117///
3118/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi32)
3119#[inline]
3120#[target_feature(enable = "avx2")]
3121#[cfg_attr(test, assert_instr(vpsrlvd))]
3122#[stable(feature = "simd_x86", since = "1.27.0")]
3123pub fn _mm_srlv_epi32(a: __m128i, count: __m128i) -> __m128i {
3124    unsafe {
3125        let count = count.as_u32x4();
3126        let no_overflow: u32x4 = simd_lt(count, u32x4::splat(u32::BITS));
3127        let count = simd_select(no_overflow, count, u32x4::ZERO);
3128        simd_select(no_overflow, simd_shr(a.as_u32x4(), count), u32x4::ZERO).as_m128i()
3129    }
3130}
3131
3132/// Shifts packed 32-bit integers in `a` right by the amount specified by
3133/// the corresponding element in `count` while shifting in zeros.
3134///
3135/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi32)
3136#[inline]
3137#[target_feature(enable = "avx2")]
3138#[cfg_attr(test, assert_instr(vpsrlvd))]
3139#[stable(feature = "simd_x86", since = "1.27.0")]
3140pub fn _mm256_srlv_epi32(a: __m256i, count: __m256i) -> __m256i {
3141    unsafe {
3142        let count = count.as_u32x8();
3143        let no_overflow: u32x8 = simd_lt(count, u32x8::splat(u32::BITS));
3144        let count = simd_select(no_overflow, count, u32x8::ZERO);
3145        simd_select(no_overflow, simd_shr(a.as_u32x8(), count), u32x8::ZERO).as_m256i()
3146    }
3147}
3148
3149/// Shifts packed 64-bit integers in `a` right by the amount specified by
3150/// the corresponding element in `count` while shifting in zeros.
3151///
3152/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srlv_epi64)
3153#[inline]
3154#[target_feature(enable = "avx2")]
3155#[cfg_attr(test, assert_instr(vpsrlvq))]
3156#[stable(feature = "simd_x86", since = "1.27.0")]
3157pub fn _mm_srlv_epi64(a: __m128i, count: __m128i) -> __m128i {
3158    unsafe {
3159        let count = count.as_u64x2();
3160        let no_overflow: u64x2 = simd_lt(count, u64x2::splat(u64::BITS as u64));
3161        let count = simd_select(no_overflow, count, u64x2::ZERO);
3162        simd_select(no_overflow, simd_shr(a.as_u64x2(), count), u64x2::ZERO).as_m128i()
3163    }
3164}
3165
3166/// Shifts packed 64-bit integers in `a` right by the amount specified by
3167/// the corresponding element in `count` while shifting in zeros.
3168///
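/// A small sketch (arbitrary values; it assumes AVX2 is detected at runtime):
/// a per-element count of 64 or more zeroes the corresponding element.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi64x(64);
/// let count = _mm256_setr_epi64x(0, 1, 5, 64);
/// let r = _mm256_srlv_epi64(a, count);
/// // The last element has a count of 64, so it becomes zero.
/// let expected = _mm256_setr_epi64x(64, 32, 2, 0);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, expected)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///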
3169/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_srlv_epi64)
3170#[inline]
3171#[target_feature(enable = "avx2")]
3172#[cfg_attr(test, assert_instr(vpsrlvq))]
3173#[stable(feature = "simd_x86", since = "1.27.0")]
3174pub fn _mm256_srlv_epi64(a: __m256i, count: __m256i) -> __m256i {
3175    unsafe {
3176        let count = count.as_u64x4();
3177        let no_overflow: u64x4 = simd_lt(count, u64x4::splat(u64::BITS as u64));
3178        let count = simd_select(no_overflow, count, u64x4::ZERO);
3179        simd_select(no_overflow, simd_shr(a.as_u64x4(), count), u64x4::ZERO).as_m256i()
3180    }
3181}
3182
3183/// Loads 256 bits of integer data from memory into `dst` using a non-temporal memory hint.
3184/// `mem_addr` must be aligned on a 32-byte boundary or a general-protection exception may be
3185/// generated. To minimize caching, the data is flagged as non-temporal (unlikely to be used again soon).
3186///
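/// A minimal usage sketch (it assumes AVX2 is detected at runtime): the
/// address of a `__m256i` local already satisfies the 32-byte alignment
/// requirement.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let data = _mm256_set1_epi32(7);
/// // `&data` is 32-byte aligned because `__m256i` has 32-byte alignment.
/// let loaded = unsafe { _mm256_stream_load_si256(&data as *const __m256i) };
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(loaded, data)), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///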
3187/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_stream_load_si256)
3188#[inline]
3189#[target_feature(enable = "avx2")]
3190#[cfg_attr(test, assert_instr(vmovntdqa))]
3191#[stable(feature = "simd_x86_updates", since = "1.82.0")]
3192pub unsafe fn _mm256_stream_load_si256(mem_addr: *const __m256i) -> __m256i {
3193    let dst: __m256i;
3194    crate::arch::asm!(
3195        vpl!("vmovntdqa {a}"),
3196        a = out(ymm_reg) dst,
3197        p = in(reg) mem_addr,
3198        options(pure, readonly, nostack, preserves_flags),
3199    );
3200    dst
3201}
3202
3203/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in `a`.
3204///
3205/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi16)
3206#[inline]
3207#[target_feature(enable = "avx2")]
3208#[cfg_attr(test, assert_instr(vpsubw))]
3209#[stable(feature = "simd_x86", since = "1.27.0")]
3210pub fn _mm256_sub_epi16(a: __m256i, b: __m256i) -> __m256i {
3211    unsafe { transmute(simd_sub(a.as_i16x16(), b.as_i16x16())) }
3212}
3213
3214/// Subtracts packed 32-bit integers in `b` from packed 32-bit integers in `a`.
3215///
3216/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi32)
3217#[inline]
3218#[target_feature(enable = "avx2")]
3219#[cfg_attr(test, assert_instr(vpsubd))]
3220#[stable(feature = "simd_x86", since = "1.27.0")]
3221pub fn _mm256_sub_epi32(a: __m256i, b: __m256i) -> __m256i {
3222    unsafe { transmute(simd_sub(a.as_i32x8(), b.as_i32x8())) }
3223}
3224
3225/// Subtracts packed 64-bit integers in `b` from packed 64-bit integers in `a`.
3226///
3227/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi64)
3228#[inline]
3229#[target_feature(enable = "avx2")]
3230#[cfg_attr(test, assert_instr(vpsubq))]
3231#[stable(feature = "simd_x86", since = "1.27.0")]
3232pub fn _mm256_sub_epi64(a: __m256i, b: __m256i) -> __m256i {
3233    unsafe { transmute(simd_sub(a.as_i64x4(), b.as_i64x4())) }
3234}
3235
3236/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in `a`.
3237///
3238/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_sub_epi8)
3239#[inline]
3240#[target_feature(enable = "avx2")]
3241#[cfg_attr(test, assert_instr(vpsubb))]
3242#[stable(feature = "simd_x86", since = "1.27.0")]
3243pub fn _mm256_sub_epi8(a: __m256i, b: __m256i) -> __m256i {
3244    unsafe { transmute(simd_sub(a.as_i8x32(), b.as_i8x32())) }
3245}
3246
3247/// Subtracts packed 16-bit integers in `b` from packed 16-bit integers in
3248/// `a` using saturation.
3249///
3250/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi16)
3251#[inline]
3252#[target_feature(enable = "avx2")]
3253#[cfg_attr(test, assert_instr(vpsubsw))]
3254#[stable(feature = "simd_x86", since = "1.27.0")]
3255pub fn _mm256_subs_epi16(a: __m256i, b: __m256i) -> __m256i {
3256    unsafe { transmute(simd_saturating_sub(a.as_i16x16(), b.as_i16x16())) }
3257}
3258
3259/// Subtracts packed 8-bit integers in `b` from packed 8-bit integers in
3260/// `a` using saturation.
3261///
3262/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epi8)
3263#[inline]
3264#[target_feature(enable = "avx2")]
3265#[cfg_attr(test, assert_instr(vpsubsb))]
3266#[stable(feature = "simd_x86", since = "1.27.0")]
3267pub fn _mm256_subs_epi8(a: __m256i, b: __m256i) -> __m256i {
3268    unsafe { transmute(simd_saturating_sub(a.as_i8x32(), b.as_i8x32())) }
3269}
3270
3271/// Subtracts packed unsigned 16-bit integers in `b` from packed unsigned
3272/// 16-bit integers in `a` using saturation.
3273///
3274/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu16)
3275#[inline]
3276#[target_feature(enable = "avx2")]
3277#[cfg_attr(test, assert_instr(vpsubusw))]
3278#[stable(feature = "simd_x86", since = "1.27.0")]
3279pub fn _mm256_subs_epu16(a: __m256i, b: __m256i) -> __m256i {
3280    unsafe { transmute(simd_saturating_sub(a.as_u16x16(), b.as_u16x16())) }
3281}
3282
3283/// Subtracts packed unsigned 8-bit integers in `b` from packed unsigned
3284/// 8-bit integers in `a` using saturation.
3285///
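/// A short sketch (arbitrary values; it assumes AVX2 is detected at runtime):
/// unsigned saturation clamps results that would go below zero.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(10);
/// let b = _mm256_set1_epi8(20);
/// // 10 - 20 saturates to 0; 20 - 10 is an ordinary 10.
/// let r = _mm256_subs_epu8(a, b);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(r, _mm256_setzero_si256())), !0);
/// let s = _mm256_subs_epu8(b, a);
/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(s, _mm256_set1_epi8(10))), !0);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///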
3286/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_subs_epu8)
3287#[inline]
3288#[target_feature(enable = "avx2")]
3289#[cfg_attr(test, assert_instr(vpsubusb))]
3290#[stable(feature = "simd_x86", since = "1.27.0")]
3291pub fn _mm256_subs_epu8(a: __m256i, b: __m256i) -> __m256i {
3292    unsafe { transmute(simd_saturating_sub(a.as_u8x32(), b.as_u8x32())) }
3293}
3294
3295/// Unpacks and interleaves 8-bit integers from the high half of each
3296/// 128-bit lane of `a` and `b`.
3297///
3298/// ```rust
3299/// #[cfg(target_arch = "x86")]
3300/// use std::arch::x86::*;
3301/// #[cfg(target_arch = "x86_64")]
3302/// use std::arch::x86_64::*;
3303///
3304/// # fn main() {
3305/// #     if is_x86_feature_detected!("avx2") {
3306/// #         #[target_feature(enable = "avx2")]
3307/// #         unsafe fn worker() {
3308/// let a = _mm256_setr_epi8(
3309///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3310///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3311/// );
3312/// let b = _mm256_setr_epi8(
3313///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3314///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3315///     -30, -31,
3316/// );
3317///
3318/// let c = _mm256_unpackhi_epi8(a, b);
3319///
3320/// let expected = _mm256_setr_epi8(
3321///     8, -8, 9, -9, 10, -10, 11, -11, 12, -12, 13, -13, 14, -14, 15, -15,
3322///     24, -24, 25, -25, 26, -26, 27, -27, 28, -28, 29, -29, 30, -30, 31,
3323///     -31,
3324/// );
3325/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3326///
3327/// #         }
3328/// #         unsafe { worker(); }
3329/// #     }
3330/// # }
3331/// ```
3332///
3333/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi8)
3334#[inline]
3335#[target_feature(enable = "avx2")]
3336#[cfg_attr(test, assert_instr(vpunpckhbw))]
3337#[stable(feature = "simd_x86", since = "1.27.0")]
3338pub fn _mm256_unpackhi_epi8(a: __m256i, b: __m256i) -> __m256i {
3339    unsafe {
3340        #[rustfmt::skip]
3341        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3342                8, 40, 9, 41, 10, 42, 11, 43,
3343                12, 44, 13, 45, 14, 46, 15, 47,
3344                24, 56, 25, 57, 26, 58, 27, 59,
3345                28, 60, 29, 61, 30, 62, 31, 63,
3346        ]);
3347        transmute(r)
3348    }
3349}
3350
3351/// Unpacks and interleaves 8-bit integers from the low half of each
3352/// 128-bit lane of `a` and `b`.
3353///
3354/// ```rust
3355/// #[cfg(target_arch = "x86")]
3356/// use std::arch::x86::*;
3357/// #[cfg(target_arch = "x86_64")]
3358/// use std::arch::x86_64::*;
3359///
3360/// # fn main() {
3361/// #     if is_x86_feature_detected!("avx2") {
3362/// #         #[target_feature(enable = "avx2")]
3363/// #         unsafe fn worker() {
3364/// let a = _mm256_setr_epi8(
3365///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
3366///     20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
3367/// );
3368/// let b = _mm256_setr_epi8(
3369///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3370///     -16, -17, -18, -19, -20, -21, -22, -23, -24, -25, -26, -27, -28, -29,
3371///     -30, -31,
3372/// );
3373///
3374/// let c = _mm256_unpacklo_epi8(a, b);
3375///
3376/// let expected = _mm256_setr_epi8(
3377///     0, 0, 1, -1, 2, -2, 3, -3, 4, -4, 5, -5, 6, -6, 7, -7, 16, -16, 17,
3378///     -17, 18, -18, 19, -19, 20, -20, 21, -21, 22, -22, 23, -23,
3379/// );
3380/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3381///
3382/// #         }
3383/// #         unsafe { worker(); }
3384/// #     }
3385/// # }
3386/// ```
3387///
3388/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi8)
3389#[inline]
3390#[target_feature(enable = "avx2")]
3391#[cfg_attr(test, assert_instr(vpunpcklbw))]
3392#[stable(feature = "simd_x86", since = "1.27.0")]
3393pub fn _mm256_unpacklo_epi8(a: __m256i, b: __m256i) -> __m256i {
3394    unsafe {
3395        #[rustfmt::skip]
3396        let r: i8x32 = simd_shuffle!(a.as_i8x32(), b.as_i8x32(), [
3397            0, 32, 1, 33, 2, 34, 3, 35,
3398            4, 36, 5, 37, 6, 38, 7, 39,
3399            16, 48, 17, 49, 18, 50, 19, 51,
3400            20, 52, 21, 53, 22, 54, 23, 55,
3401        ]);
3402        transmute(r)
3403    }
3404}
3405
3406/// Unpacks and interleaves 16-bit integers from the high half of each
3407/// 128-bit lane of `a` and `b`.
3408///
3409/// ```rust
3410/// #[cfg(target_arch = "x86")]
3411/// use std::arch::x86::*;
3412/// #[cfg(target_arch = "x86_64")]
3413/// use std::arch::x86_64::*;
3414///
3415/// # fn main() {
3416/// #     if is_x86_feature_detected!("avx2") {
3417/// #         #[target_feature(enable = "avx2")]
3418/// #         unsafe fn worker() {
3419/// let a = _mm256_setr_epi16(
3420///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3421/// );
3422/// let b = _mm256_setr_epi16(
3423///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3424/// );
3425///
3426/// let c = _mm256_unpackhi_epi16(a, b);
3427///
3428/// let expected = _mm256_setr_epi16(
3429///     4, -4, 5, -5, 6, -6, 7, -7, 12, -12, 13, -13, 14, -14, 15, -15,
3430/// );
3431/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3432///
3433/// #         }
3434/// #         unsafe { worker(); }
3435/// #     }
3436/// # }
3437/// ```
3438///
3439/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi16)
3440#[inline]
3441#[target_feature(enable = "avx2")]
3442#[cfg_attr(test, assert_instr(vpunpckhwd))]
3443#[stable(feature = "simd_x86", since = "1.27.0")]
3444pub fn _mm256_unpackhi_epi16(a: __m256i, b: __m256i) -> __m256i {
3445    unsafe {
3446        let r: i16x16 = simd_shuffle!(
3447            a.as_i16x16(),
3448            b.as_i16x16(),
3449            [4, 20, 5, 21, 6, 22, 7, 23, 12, 28, 13, 29, 14, 30, 15, 31],
3450        );
3451        transmute(r)
3452    }
3453}
3454
3455/// Unpacks and interleaves 16-bit integers from the low half of each
3456/// 128-bit lane of `a` and `b`.
3457///
3458/// ```rust
3459/// #[cfg(target_arch = "x86")]
3460/// use std::arch::x86::*;
3461/// #[cfg(target_arch = "x86_64")]
3462/// use std::arch::x86_64::*;
3463///
3464/// # fn main() {
3465/// #     if is_x86_feature_detected!("avx2") {
3466/// #         #[target_feature(enable = "avx2")]
3467/// #         unsafe fn worker() {
3468///
3469/// let a = _mm256_setr_epi16(
3470///     0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
3471/// );
3472/// let b = _mm256_setr_epi16(
3473///     0, -1, -2, -3, -4, -5, -6, -7, -8, -9, -10, -11, -12, -13, -14, -15,
3474/// );
3475///
3476/// let c = _mm256_unpacklo_epi16(a, b);
3477///
3478/// let expected = _mm256_setr_epi16(
3479///     0, 0, 1, -1, 2, -2, 3, -3, 8, -8, 9, -9, 10, -10, 11, -11,
3480/// );
3481/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3482///
3483/// #         }
3484/// #         unsafe { worker(); }
3485/// #     }
3486/// # }
3487/// ```
3488///
3489/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi16)
3490#[inline]
3491#[target_feature(enable = "avx2")]
3492#[cfg_attr(test, assert_instr(vpunpcklwd))]
3493#[stable(feature = "simd_x86", since = "1.27.0")]
3494pub fn _mm256_unpacklo_epi16(a: __m256i, b: __m256i) -> __m256i {
3495    unsafe {
3496        let r: i16x16 = simd_shuffle!(
3497            a.as_i16x16(),
3498            b.as_i16x16(),
3499            [0, 16, 1, 17, 2, 18, 3, 19, 8, 24, 9, 25, 10, 26, 11, 27],
3500        );
3501        transmute(r)
3502    }
3503}
3504
3505/// Unpacks and interleaves 32-bit integers from the high half of each
3506/// 128-bit lane of `a` and `b`.
3507///
3508/// ```rust
3509/// #[cfg(target_arch = "x86")]
3510/// use std::arch::x86::*;
3511/// #[cfg(target_arch = "x86_64")]
3512/// use std::arch::x86_64::*;
3513///
3514/// # fn main() {
3515/// #     if is_x86_feature_detected!("avx2") {
3516/// #         #[target_feature(enable = "avx2")]
3517/// #         unsafe fn worker() {
3518/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3519/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3520///
3521/// let c = _mm256_unpackhi_epi32(a, b);
3522///
3523/// let expected = _mm256_setr_epi32(2, -2, 3, -3, 6, -6, 7, -7);
3524/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3525///
3526/// #         }
3527/// #         unsafe { worker(); }
3528/// #     }
3529/// # }
3530/// ```
3531///
3532/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi32)
3533#[inline]
3534#[target_feature(enable = "avx2")]
3535#[cfg_attr(test, assert_instr(vunpckhps))]
3536#[stable(feature = "simd_x86", since = "1.27.0")]
3537pub fn _mm256_unpackhi_epi32(a: __m256i, b: __m256i) -> __m256i {
3538    unsafe {
3539        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [2, 10, 3, 11, 6, 14, 7, 15]);
3540        transmute(r)
3541    }
3542}
3543
3544/// Unpacks and interleaves 32-bit integers from the low half of each
3545/// 128-bit lane of `a` and `b`.
3546///
3547/// ```rust
3548/// #[cfg(target_arch = "x86")]
3549/// use std::arch::x86::*;
3550/// #[cfg(target_arch = "x86_64")]
3551/// use std::arch::x86_64::*;
3552///
3553/// # fn main() {
3554/// #     if is_x86_feature_detected!("avx2") {
3555/// #         #[target_feature(enable = "avx2")]
3556/// #         unsafe fn worker() {
3557/// let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
3558/// let b = _mm256_setr_epi32(0, -1, -2, -3, -4, -5, -6, -7);
3559///
3560/// let c = _mm256_unpacklo_epi32(a, b);
3561///
3562/// let expected = _mm256_setr_epi32(0, 0, 1, -1, 4, -4, 5, -5);
3563/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3564///
3565/// #         }
3566/// #         unsafe { worker(); }
3567/// #     }
3568/// # }
3569/// ```
3570///
3571/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi32)
3572#[inline]
3573#[target_feature(enable = "avx2")]
3574#[cfg_attr(test, assert_instr(vunpcklps))]
3575#[stable(feature = "simd_x86", since = "1.27.0")]
3576pub fn _mm256_unpacklo_epi32(a: __m256i, b: __m256i) -> __m256i {
3577    unsafe {
3578        let r: i32x8 = simd_shuffle!(a.as_i32x8(), b.as_i32x8(), [0, 8, 1, 9, 4, 12, 5, 13]);
3579        transmute(r)
3580    }
3581}
3582
3583/// Unpacks and interleaves 64-bit integers from the high half of each
3584/// 128-bit lane of `a` and `b`.
3585///
3586/// ```rust
3587/// #[cfg(target_arch = "x86")]
3588/// use std::arch::x86::*;
3589/// #[cfg(target_arch = "x86_64")]
3590/// use std::arch::x86_64::*;
3591///
3592/// # fn main() {
3593/// #     if is_x86_feature_detected!("avx2") {
3594/// #         #[target_feature(enable = "avx2")]
3595/// #         unsafe fn worker() {
3596/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3597/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3598///
3599/// let c = _mm256_unpackhi_epi64(a, b);
3600///
3601/// let expected = _mm256_setr_epi64x(1, -1, 3, -3);
3602/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3603///
3604/// #         }
3605/// #         unsafe { worker(); }
3606/// #     }
3607/// # }
3608/// ```
3609///
3610/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpackhi_epi64)
3611#[inline]
3612#[target_feature(enable = "avx2")]
3613#[cfg_attr(test, assert_instr(vunpckhpd))]
3614#[stable(feature = "simd_x86", since = "1.27.0")]
3615pub fn _mm256_unpackhi_epi64(a: __m256i, b: __m256i) -> __m256i {
3616    unsafe {
3617        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [1, 5, 3, 7]);
3618        transmute(r)
3619    }
3620}
3621
3622/// Unpacks and interleaves 64-bit integers from the low half of each
3623/// 128-bit lane of `a` and `b`.
3624///
3625/// ```rust
3626/// #[cfg(target_arch = "x86")]
3627/// use std::arch::x86::*;
3628/// #[cfg(target_arch = "x86_64")]
3629/// use std::arch::x86_64::*;
3630///
3631/// # fn main() {
3632/// #     if is_x86_feature_detected!("avx2") {
3633/// #         #[target_feature(enable = "avx2")]
3634/// #         unsafe fn worker() {
3635/// let a = _mm256_setr_epi64x(0, 1, 2, 3);
3636/// let b = _mm256_setr_epi64x(0, -1, -2, -3);
3637///
3638/// let c = _mm256_unpacklo_epi64(a, b);
3639///
3640/// let expected = _mm256_setr_epi64x(0, 0, 2, -2);
3641/// assert_eq!(_mm256_movemask_epi8(_mm256_cmpeq_epi8(c, expected)), !0);
3642///
3643/// #         }
3644/// #         unsafe { worker(); }
3645/// #     }
3646/// # }
3647/// ```
3648///
3649/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_unpacklo_epi64)
3650#[inline]
3651#[target_feature(enable = "avx2")]
3652#[cfg_attr(test, assert_instr(vunpcklpd))]
3653#[stable(feature = "simd_x86", since = "1.27.0")]
3654pub fn _mm256_unpacklo_epi64(a: __m256i, b: __m256i) -> __m256i {
3655    unsafe {
3656        let r: i64x4 = simd_shuffle!(a.as_i64x4(), b.as_i64x4(), [0, 4, 2, 6]);
3657        transmute(r)
3658    }
3659}
3660
3661/// Computes the bitwise XOR of 256 bits (representing integer data)
3662/// in `a` and `b`.
3663///
3664/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_xor_si256)
3665#[inline]
3666#[target_feature(enable = "avx2")]
3667#[cfg_attr(test, assert_instr(vxorps))]
3668#[stable(feature = "simd_x86", since = "1.27.0")]
3669pub fn _mm256_xor_si256(a: __m256i, b: __m256i) -> __m256i {
3670    unsafe { transmute(simd_xor(a.as_i64x4(), b.as_i64x4())) }
3671}
3672
3673/// Extracts an 8-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3674/// integer containing the zero-extended integer data.
3675///
3676/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3677///
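/// A brief sketch (it assumes AVX2 is detected at runtime): the extracted byte
/// is zero-extended, so a byte of `0xFF` comes back as `255`, not `-1`.
///
/// ```rust
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::*;
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::*;
///
/// # fn main() {
/// #     if is_x86_feature_detected!("avx2") {
/// #         #[target_feature(enable = "avx2")]
/// #         unsafe fn worker() {
/// let a = _mm256_set1_epi8(-1);
/// assert_eq!(_mm256_extract_epi8::<3>(a), 255);
///
/// #         }
/// #         unsafe { worker(); }
/// #     }
/// # }
/// ```
///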
3678/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi8)
3679#[inline]
3680#[target_feature(enable = "avx2")]
3681// This intrinsic has no corresponding instruction.
3682#[rustc_legacy_const_generics(1)]
3683#[stable(feature = "simd_x86", since = "1.27.0")]
3684pub fn _mm256_extract_epi8<const INDEX: i32>(a: __m256i) -> i32 {
3685    static_assert_uimm_bits!(INDEX, 5);
3686    unsafe { simd_extract!(a.as_u8x32(), INDEX as u32, u8) as i32 }
3687}
3688
3689/// Extracts a 16-bit integer from `a`, selected with `INDEX`. Returns a 32-bit
3690/// integer containing the zero-extended integer data.
3691///
3692/// See [LLVM commit D20468](https://reviews.llvm.org/D20468).
3693///
3694/// [Intel's documentation](https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_extract_epi16)
3695#[inline]
3696#[target_feature(enable = "avx2")]
3697// This intrinsic has no corresponding instruction.
3698#[rustc_legacy_const_generics(1)]
3699#[stable(feature = "simd_x86", since = "1.27.0")]
3700pub fn _mm256_extract_epi16<const INDEX: i32>(a: __m256i) -> i32 {
3701    static_assert_uimm_bits!(INDEX, 4);
3702    unsafe { simd_extract!(a.as_u16x16(), INDEX as u32, u16) as i32 }
3703}
3704
3705#[allow(improper_ctypes)]
3706unsafe extern "C" {
3707    #[link_name = "llvm.x86.avx2.phadd.sw"]
3708    fn phaddsw(a: i16x16, b: i16x16) -> i16x16;
3709    #[link_name = "llvm.x86.avx2.phsub.sw"]
3710    fn phsubsw(a: i16x16, b: i16x16) -> i16x16;
3711    #[link_name = "llvm.x86.avx2.pmadd.wd"]
3712    fn pmaddwd(a: i16x16, b: i16x16) -> i32x8;
3713    #[link_name = "llvm.x86.avx2.pmadd.ub.sw"]
3714    fn pmaddubsw(a: u8x32, b: i8x32) -> i16x16;
3715    #[link_name = "llvm.x86.avx2.mpsadbw"]
3716    fn mpsadbw(a: u8x32, b: u8x32, imm8: i8) -> u16x16;
3717    #[link_name = "llvm.x86.avx2.pmul.hr.sw"]
3718    fn pmulhrsw(a: i16x16, b: i16x16) -> i16x16;
3719    #[link_name = "llvm.x86.avx2.packsswb"]
3720    fn packsswb(a: i16x16, b: i16x16) -> i8x32;
3721    #[link_name = "llvm.x86.avx2.packssdw"]
3722    fn packssdw(a: i32x8, b: i32x8) -> i16x16;
3723    #[link_name = "llvm.x86.avx2.packuswb"]
3724    fn packuswb(a: i16x16, b: i16x16) -> u8x32;
3725    #[link_name = "llvm.x86.avx2.packusdw"]
3726    fn packusdw(a: i32x8, b: i32x8) -> u16x16;
3727    #[link_name = "llvm.x86.avx2.psad.bw"]
3728    fn psadbw(a: u8x32, b: u8x32) -> u64x4;
3729    #[link_name = "llvm.x86.avx2.psign.b"]
3730    fn psignb(a: i8x32, b: i8x32) -> i8x32;
3731    #[link_name = "llvm.x86.avx2.psign.w"]
3732    fn psignw(a: i16x16, b: i16x16) -> i16x16;
3733    #[link_name = "llvm.x86.avx2.psign.d"]
3734    fn psignd(a: i32x8, b: i32x8) -> i32x8;
3735    #[link_name = "llvm.x86.avx2.psll.w"]
3736    fn psllw(a: i16x16, count: i16x8) -> i16x16;
3737    #[link_name = "llvm.x86.avx2.psll.d"]
3738    fn pslld(a: i32x8, count: i32x4) -> i32x8;
3739    #[link_name = "llvm.x86.avx2.psll.q"]
3740    fn psllq(a: i64x4, count: i64x2) -> i64x4;
3741    #[link_name = "llvm.x86.avx2.psra.w"]
3742    fn psraw(a: i16x16, count: i16x8) -> i16x16;
3743    #[link_name = "llvm.x86.avx2.psra.d"]
3744    fn psrad(a: i32x8, count: i32x4) -> i32x8;
3745    #[link_name = "llvm.x86.avx2.psrl.w"]
3746    fn psrlw(a: i16x16, count: i16x8) -> i16x16;
3747    #[link_name = "llvm.x86.avx2.psrl.d"]
3748    fn psrld(a: i32x8, count: i32x4) -> i32x8;
3749    #[link_name = "llvm.x86.avx2.psrl.q"]
3750    fn psrlq(a: i64x4, count: i64x2) -> i64x4;
3751    #[link_name = "llvm.x86.avx2.pshuf.b"]
3752    fn pshufb(a: u8x32, b: u8x32) -> u8x32;
3753    #[link_name = "llvm.x86.avx2.permd"]
3754    fn permd(a: u32x8, b: u32x8) -> u32x8;
3755    #[link_name = "llvm.x86.avx2.permps"]
3756    fn permps(a: __m256, b: i32x8) -> __m256;
3757    #[link_name = "llvm.x86.avx2.gather.d.d"]
3758    fn pgatherdd(src: i32x4, slice: *const i8, offsets: i32x4, mask: i32x4, scale: i8) -> i32x4;
3759    #[link_name = "llvm.x86.avx2.gather.d.d.256"]
3760    fn vpgatherdd(src: i32x8, slice: *const i8, offsets: i32x8, mask: i32x8, scale: i8) -> i32x8;
3761    #[link_name = "llvm.x86.avx2.gather.d.q"]
3762    fn pgatherdq(src: i64x2, slice: *const i8, offsets: i32x4, mask: i64x2, scale: i8) -> i64x2;
3763    #[link_name = "llvm.x86.avx2.gather.d.q.256"]
3764    fn vpgatherdq(src: i64x4, slice: *const i8, offsets: i32x4, mask: i64x4, scale: i8) -> i64x4;
3765    #[link_name = "llvm.x86.avx2.gather.q.d"]
3766    fn pgatherqd(src: i32x4, slice: *const i8, offsets: i64x2, mask: i32x4, scale: i8) -> i32x4;
3767    #[link_name = "llvm.x86.avx2.gather.q.d.256"]
3768    fn vpgatherqd(src: i32x4, slice: *const i8, offsets: i64x4, mask: i32x4, scale: i8) -> i32x4;
3769    #[link_name = "llvm.x86.avx2.gather.q.q"]
3770    fn pgatherqq(src: i64x2, slice: *const i8, offsets: i64x2, mask: i64x2, scale: i8) -> i64x2;
3771    #[link_name = "llvm.x86.avx2.gather.q.q.256"]
3772    fn vpgatherqq(src: i64x4, slice: *const i8, offsets: i64x4, mask: i64x4, scale: i8) -> i64x4;
3773    #[link_name = "llvm.x86.avx2.gather.d.pd"]
3774    fn pgatherdpd(
3775        src: __m128d,
3776        slice: *const i8,
3777        offsets: i32x4,
3778        mask: __m128d,
3779        scale: i8,
3780    ) -> __m128d;
3781    #[link_name = "llvm.x86.avx2.gather.d.pd.256"]
3782    fn vpgatherdpd(
3783        src: __m256d,
3784        slice: *const i8,
3785        offsets: i32x4,
3786        mask: __m256d,
3787        scale: i8,
3788    ) -> __m256d;
3789    #[link_name = "llvm.x86.avx2.gather.q.pd"]
3790    fn pgatherqpd(
3791        src: __m128d,
3792        slice: *const i8,
3793        offsets: i64x2,
3794        mask: __m128d,
3795        scale: i8,
3796    ) -> __m128d;
3797    #[link_name = "llvm.x86.avx2.gather.q.pd.256"]
3798    fn vpgatherqpd(
3799        src: __m256d,
3800        slice: *const i8,
3801        offsets: i64x4,
3802        mask: __m256d,
3803        scale: i8,
3804    ) -> __m256d;
3805    #[link_name = "llvm.x86.avx2.gather.d.ps"]
3806    fn pgatherdps(src: __m128, slice: *const i8, offsets: i32x4, mask: __m128, scale: i8)
3807    -> __m128;
3808    #[link_name = "llvm.x86.avx2.gather.d.ps.256"]
3809    fn vpgatherdps(
3810        src: __m256,
3811        slice: *const i8,
3812        offsets: i32x8,
3813        mask: __m256,
3814        scale: i8,
3815    ) -> __m256;
3816    #[link_name = "llvm.x86.avx2.gather.q.ps"]
3817    fn pgatherqps(src: __m128, slice: *const i8, offsets: i64x2, mask: __m128, scale: i8)
3818    -> __m128;
3819    #[link_name = "llvm.x86.avx2.gather.q.ps.256"]
3820    fn vpgatherqps(
3821        src: __m128,
3822        slice: *const i8,
3823        offsets: i64x4,
3824        mask: __m128,
3825        scale: i8,
3826    ) -> __m128;
3827}
3828
3829#[cfg(test)]
3830mod tests {
3831
3832    use stdarch_test::simd_test;
3833
3834    use crate::core_arch::x86::*;
3835
3836    #[simd_test(enable = "avx2")]
3837    unsafe fn test_mm256_abs_epi32() {
3838        #[rustfmt::skip]
3839        let a = _mm256_setr_epi32(
3840            0, 1, -1, i32::MAX,
3841            i32::MIN, 100, -100, -32,
3842        );
3843        let r = _mm256_abs_epi32(a);
3844        #[rustfmt::skip]
3845        let e = _mm256_setr_epi32(
3846            0, 1, 1, i32::MAX,
3847            i32::MAX.wrapping_add(1), 100, 100, 32,
3848        );
3849        assert_eq_m256i(r, e);
3850    }
3851
3852    #[simd_test(enable = "avx2")]
3853    unsafe fn test_mm256_abs_epi16() {
3854        #[rustfmt::skip]
3855        let a = _mm256_setr_epi16(
3856            0,  1, -1, 2, -2, 3, -3, 4,
3857            -4, 5, -5, i16::MAX, i16::MIN, 100, -100, -32,
3858        );
3859        let r = _mm256_abs_epi16(a);
3860        #[rustfmt::skip]
3861        let e = _mm256_setr_epi16(
3862            0, 1, 1, 2, 2, 3, 3, 4,
3863            4, 5, 5, i16::MAX, i16::MAX.wrapping_add(1), 100, 100, 32,
3864        );
3865        assert_eq_m256i(r, e);
3866    }
3867
3868    #[simd_test(enable = "avx2")]
3869    unsafe fn test_mm256_abs_epi8() {
3870        #[rustfmt::skip]
3871        let a = _mm256_setr_epi8(
3872            0, 1, -1, 2, -2, 3, -3, 4,
3873            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3874            0, 1, -1, 2, -2, 3, -3, 4,
3875            -4, 5, -5, i8::MAX, i8::MIN, 100, -100, -32,
3876        );
3877        let r = _mm256_abs_epi8(a);
3878        #[rustfmt::skip]
3879        let e = _mm256_setr_epi8(
3880            0, 1, 1, 2, 2, 3, 3, 4,
3881            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3882            0, 1, 1, 2, 2, 3, 3, 4,
3883            4, 5, 5, i8::MAX, i8::MAX.wrapping_add(1), 100, 100, 32,
3884        );
3885        assert_eq_m256i(r, e);
3886    }
3887
3888    #[simd_test(enable = "avx2")]
3889    unsafe fn test_mm256_add_epi64() {
3890        let a = _mm256_setr_epi64x(-10, 0, 100, 1_000_000_000);
3891        let b = _mm256_setr_epi64x(-1, 0, 1, 2);
3892        let r = _mm256_add_epi64(a, b);
3893        let e = _mm256_setr_epi64x(-11, 0, 101, 1_000_000_002);
3894        assert_eq_m256i(r, e);
3895    }
3896
3897    #[simd_test(enable = "avx2")]
3898    unsafe fn test_mm256_add_epi32() {
3899        let a = _mm256_setr_epi32(-1, 0, 1, 2, 3, 4, 5, 6);
3900        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
3901        let r = _mm256_add_epi32(a, b);
3902        let e = _mm256_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14);
3903        assert_eq_m256i(r, e);
3904    }
3905
3906    #[simd_test(enable = "avx2")]
3907    unsafe fn test_mm256_add_epi16() {
3908        #[rustfmt::skip]
3909        let a = _mm256_setr_epi16(
3910            0, 1, 2, 3, 4, 5, 6, 7,
3911            8, 9, 10, 11, 12, 13, 14, 15,
3912        );
3913        #[rustfmt::skip]
3914        let b = _mm256_setr_epi16(
3915            0, 1, 2, 3, 4, 5, 6, 7,
3916            8, 9, 10, 11, 12, 13, 14, 15,
3917        );
3918        let r = _mm256_add_epi16(a, b);
3919        #[rustfmt::skip]
3920        let e = _mm256_setr_epi16(
3921            0, 2, 4, 6, 8, 10, 12, 14,
3922            16, 18, 20, 22, 24, 26, 28, 30,
3923        );
3924        assert_eq_m256i(r, e);
3925    }
3926
3927    #[simd_test(enable = "avx2")]
3928    unsafe fn test_mm256_add_epi8() {
3929        #[rustfmt::skip]
3930        let a = _mm256_setr_epi8(
3931            0, 1, 2, 3, 4, 5, 6, 7,
3932            8, 9, 10, 11, 12, 13, 14, 15,
3933            16, 17, 18, 19, 20, 21, 22, 23,
3934            24, 25, 26, 27, 28, 29, 30, 31,
3935        );
3936        #[rustfmt::skip]
3937        let b = _mm256_setr_epi8(
3938            0, 1, 2, 3, 4, 5, 6, 7,
3939            8, 9, 10, 11, 12, 13, 14, 15,
3940            16, 17, 18, 19, 20, 21, 22, 23,
3941            24, 25, 26, 27, 28, 29, 30, 31,
3942        );
3943        let r = _mm256_add_epi8(a, b);
3944        #[rustfmt::skip]
3945        let e = _mm256_setr_epi8(
3946            0, 2, 4, 6, 8, 10, 12, 14,
3947            16, 18, 20, 22, 24, 26, 28, 30,
3948            32, 34, 36, 38, 40, 42, 44, 46,
3949            48, 50, 52, 54, 56, 58, 60, 62,
3950        );
3951        assert_eq_m256i(r, e);
3952    }
3953
3954    #[simd_test(enable = "avx2")]
3955    unsafe fn test_mm256_adds_epi8() {
3956        #[rustfmt::skip]
3957        let a = _mm256_setr_epi8(
3958            0, 1, 2, 3, 4, 5, 6, 7,
3959            8, 9, 10, 11, 12, 13, 14, 15,
3960            16, 17, 18, 19, 20, 21, 22, 23,
3961            24, 25, 26, 27, 28, 29, 30, 31,
3962        );
3963        #[rustfmt::skip]
3964        let b = _mm256_setr_epi8(
3965            32, 33, 34, 35, 36, 37, 38, 39,
3966            40, 41, 42, 43, 44, 45, 46, 47,
3967            48, 49, 50, 51, 52, 53, 54, 55,
3968            56, 57, 58, 59, 60, 61, 62, 63,
3969        );
3970        let r = _mm256_adds_epi8(a, b);
3971        #[rustfmt::skip]
3972        let e = _mm256_setr_epi8(
3973            32, 34, 36, 38, 40, 42, 44, 46,
3974            48, 50, 52, 54, 56, 58, 60, 62,
3975            64, 66, 68, 70, 72, 74, 76, 78,
3976            80, 82, 84, 86, 88, 90, 92, 94,
3977        );
3978        assert_eq_m256i(r, e);
3979    }
3980
3981    #[simd_test(enable = "avx2")]
3982    unsafe fn test_mm256_adds_epi8_saturate_positive() {
3983        let a = _mm256_set1_epi8(0x7F);
3984        let b = _mm256_set1_epi8(1);
3985        let r = _mm256_adds_epi8(a, b);
3986        assert_eq_m256i(r, a);
3987    }
3988
3989    #[simd_test(enable = "avx2")]
3990    unsafe fn test_mm256_adds_epi8_saturate_negative() {
3991        let a = _mm256_set1_epi8(-0x80);
3992        let b = _mm256_set1_epi8(-1);
3993        let r = _mm256_adds_epi8(a, b);
3994        assert_eq_m256i(r, a);
3995    }
3996
3997    #[simd_test(enable = "avx2")]
3998    unsafe fn test_mm256_adds_epi16() {
3999        #[rustfmt::skip]
4000        let a = _mm256_setr_epi16(
4001            0, 1, 2, 3, 4, 5, 6, 7,
4002            8, 9, 10, 11, 12, 13, 14, 15,
4003        );
4004        #[rustfmt::skip]
4005        let b = _mm256_setr_epi16(
4006            32, 33, 34, 35, 36, 37, 38, 39,
4007            40, 41, 42, 43, 44, 45, 46, 47,
4008        );
4009        let r = _mm256_adds_epi16(a, b);
4010        #[rustfmt::skip]
4011        let e = _mm256_setr_epi16(
4012            32, 34, 36, 38, 40, 42, 44, 46,
4013            48, 50, 52, 54, 56, 58, 60, 62,
4014        );
4015
4016        assert_eq_m256i(r, e);
4017    }
4018
4019    #[simd_test(enable = "avx2")]
4020    unsafe fn test_mm256_adds_epi16_saturate_positive() {
4021        let a = _mm256_set1_epi16(0x7FFF);
4022        let b = _mm256_set1_epi16(1);
4023        let r = _mm256_adds_epi16(a, b);
4024        assert_eq_m256i(r, a);
4025    }
4026
4027    #[simd_test(enable = "avx2")]
4028    unsafe fn test_mm256_adds_epi16_saturate_negative() {
4029        let a = _mm256_set1_epi16(-0x8000);
4030        let b = _mm256_set1_epi16(-1);
4031        let r = _mm256_adds_epi16(a, b);
4032        assert_eq_m256i(r, a);
4033    }
4034
4035    #[simd_test(enable = "avx2")]
4036    unsafe fn test_mm256_adds_epu8() {
4037        #[rustfmt::skip]
4038        let a = _mm256_setr_epi8(
4039            0, 1, 2, 3, 4, 5, 6, 7,
4040            8, 9, 10, 11, 12, 13, 14, 15,
4041            16, 17, 18, 19, 20, 21, 22, 23,
4042            24, 25, 26, 27, 28, 29, 30, 31,
4043        );
4044        #[rustfmt::skip]
4045        let b = _mm256_setr_epi8(
4046            32, 33, 34, 35, 36, 37, 38, 39,
4047            40, 41, 42, 43, 44, 45, 46, 47,
4048            48, 49, 50, 51, 52, 53, 54, 55,
4049            56, 57, 58, 59, 60, 61, 62, 63,
4050        );
4051        let r = _mm256_adds_epu8(a, b);
4052        #[rustfmt::skip]
4053        let e = _mm256_setr_epi8(
4054            32, 34, 36, 38, 40, 42, 44, 46,
4055            48, 50, 52, 54, 56, 58, 60, 62,
4056            64, 66, 68, 70, 72, 74, 76, 78,
4057            80, 82, 84, 86, 88, 90, 92, 94,
4058        );
4059        assert_eq_m256i(r, e);
4060    }
4061
4062    #[simd_test(enable = "avx2")]
4063    unsafe fn test_mm256_adds_epu8_saturate() {
4064        let a = _mm256_set1_epi8(!0);
4065        let b = _mm256_set1_epi8(1);
4066        let r = _mm256_adds_epu8(a, b);
4067        assert_eq_m256i(r, a);
4068    }
4069
4070    #[simd_test(enable = "avx2")]
4071    unsafe fn test_mm256_adds_epu16() {
4072        #[rustfmt::skip]
4073        let a = _mm256_setr_epi16(
4074            0, 1, 2, 3, 4, 5, 6, 7,
4075            8, 9, 10, 11, 12, 13, 14, 15,
4076        );
4077        #[rustfmt::skip]
4078        let b = _mm256_setr_epi16(
4079            32, 33, 34, 35, 36, 37, 38, 39,
4080            40, 41, 42, 43, 44, 45, 46, 47,
4081        );
4082        let r = _mm256_adds_epu16(a, b);
4083        #[rustfmt::skip]
4084        let e = _mm256_setr_epi16(
4085            32, 34, 36, 38, 40, 42, 44, 46,
4086            48, 50, 52, 54, 56, 58, 60, 62,
4087        );
4088
4089        assert_eq_m256i(r, e);
4090    }
4091
4092    #[simd_test(enable = "avx2")]
4093    unsafe fn test_mm256_adds_epu16_saturate() {
4094        let a = _mm256_set1_epi16(!0);
4095        let b = _mm256_set1_epi16(1);
4096        let r = _mm256_adds_epu16(a, b);
4097        assert_eq_m256i(r, a);
4098    }
4099
4100    #[simd_test(enable = "avx2")]
4101    unsafe fn test_mm256_and_si256() {
4102        let a = _mm256_set1_epi8(5);
4103        let b = _mm256_set1_epi8(3);
4104        let got = _mm256_and_si256(a, b);
4105        assert_eq_m256i(got, _mm256_set1_epi8(1));
4106    }
4107
4108    #[simd_test(enable = "avx2")]
4109    unsafe fn test_mm256_andnot_si256() {
4110        let a = _mm256_set1_epi8(5);
4111        let b = _mm256_set1_epi8(3);
4112        let got = _mm256_andnot_si256(a, b);
4113        assert_eq_m256i(got, _mm256_set1_epi8(2));
4114    }
4115
4116    #[simd_test(enable = "avx2")]
4117    unsafe fn test_mm256_avg_epu8() {
4118        let (a, b) = (_mm256_set1_epi8(3), _mm256_set1_epi8(9));
4119        let r = _mm256_avg_epu8(a, b);
4120        assert_eq_m256i(r, _mm256_set1_epi8(6));
4121    }
4122
4123    #[simd_test(enable = "avx2")]
4124    unsafe fn test_mm256_avg_epu16() {
4125        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4126        let r = _mm256_avg_epu16(a, b);
4127        assert_eq_m256i(r, _mm256_set1_epi16(6));
4128    }
4129
4130    #[simd_test(enable = "avx2")]
4131    unsafe fn test_mm_blend_epi32() {
4132        let (a, b) = (_mm_set1_epi32(3), _mm_set1_epi32(9));
4133        let e = _mm_setr_epi32(9, 3, 3, 3);
4134        let r = _mm_blend_epi32::<0x01>(a, b);
4135        assert_eq_m128i(r, e);
4136
4137        let r = _mm_blend_epi32::<0x0E>(b, a);
4138        assert_eq_m128i(r, e);
4139    }
4140
4141    #[simd_test(enable = "avx2")]
4142    unsafe fn test_mm256_blend_epi32() {
4143        let (a, b) = (_mm256_set1_epi32(3), _mm256_set1_epi32(9));
4144        let e = _mm256_setr_epi32(9, 3, 3, 3, 3, 3, 3, 3);
4145        let r = _mm256_blend_epi32::<0x01>(a, b);
4146        assert_eq_m256i(r, e);
4147
4148        let e = _mm256_setr_epi32(3, 9, 3, 3, 3, 3, 3, 9);
4149        let r = _mm256_blend_epi32::<0x82>(a, b);
4150        assert_eq_m256i(r, e);
4151
4152        let e = _mm256_setr_epi32(3, 3, 9, 9, 9, 9, 9, 3);
4153        let r = _mm256_blend_epi32::<0x7C>(a, b);
4154        assert_eq_m256i(r, e);
4155    }
4156
4157    #[simd_test(enable = "avx2")]
4158    unsafe fn test_mm256_blend_epi16() {
4159        let (a, b) = (_mm256_set1_epi16(3), _mm256_set1_epi16(9));
4160        let e = _mm256_setr_epi16(9, 3, 3, 3, 3, 3, 3, 3, 9, 3, 3, 3, 3, 3, 3, 3);
4161        let r = _mm256_blend_epi16::<0x01>(a, b);
4162        assert_eq_m256i(r, e);
4163
4164        let r = _mm256_blend_epi16::<0xFE>(b, a);
4165        assert_eq_m256i(r, e);
4166    }
4167
4168    #[simd_test(enable = "avx2")]
4169    unsafe fn test_mm256_blendv_epi8() {
4170        let (a, b) = (_mm256_set1_epi8(4), _mm256_set1_epi8(2));
4171        let mask = _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), -1);
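        // Only byte 2 of the mask has its sign bit set, so only that byte is taken from `b`.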
4172        let e = _mm256_insert_epi8::<2>(_mm256_set1_epi8(4), 2);
4173        let r = _mm256_blendv_epi8(a, b, mask);
4174        assert_eq_m256i(r, e);
4175    }
4176
4177    #[simd_test(enable = "avx2")]
4178    unsafe fn test_mm_broadcastb_epi8() {
4179        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4180        let res = _mm_broadcastb_epi8(a);
4181        assert_eq_m128i(res, _mm_set1_epi8(0x2a));
4182    }
4183
4184    #[simd_test(enable = "avx2")]
4185    unsafe fn test_mm256_broadcastb_epi8() {
4186        let a = _mm_insert_epi8::<0>(_mm_set1_epi8(0x00), 0x2a);
4187        let res = _mm256_broadcastb_epi8(a);
4188        assert_eq_m256i(res, _mm256_set1_epi8(0x2a));
4189    }
4190
4191    #[simd_test(enable = "avx2")]
4192    unsafe fn test_mm_broadcastd_epi32() {
4193        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4194        let res = _mm_broadcastd_epi32(a);
4195        assert_eq_m128i(res, _mm_set1_epi32(0x2a));
4196    }
4197
4198    #[simd_test(enable = "avx2")]
4199    unsafe fn test_mm256_broadcastd_epi32() {
4200        let a = _mm_setr_epi32(0x2a, 0x8000000, 0, 0);
4201        let res = _mm256_broadcastd_epi32(a);
4202        assert_eq_m256i(res, _mm256_set1_epi32(0x2a));
4203    }
4204
4205    #[simd_test(enable = "avx2")]
4206    unsafe fn test_mm_broadcastq_epi64() {
4207        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4208        let res = _mm_broadcastq_epi64(a);
4209        assert_eq_m128i(res, _mm_set1_epi64x(0x1ffffffff));
4210    }
4211
4212    #[simd_test(enable = "avx2")]
4213    unsafe fn test_mm256_broadcastq_epi64() {
4214        let a = _mm_setr_epi64x(0x1ffffffff, 0);
4215        let res = _mm256_broadcastq_epi64(a);
4216        assert_eq_m256i(res, _mm256_set1_epi64x(0x1ffffffff));
4217    }
4218
4219    #[simd_test(enable = "avx2")]
4220    unsafe fn test_mm_broadcastsd_pd() {
4221        let a = _mm_setr_pd(6.88, 3.44);
4222        let res = _mm_broadcastsd_pd(a);
4223        assert_eq_m128d(res, _mm_set1_pd(6.88));
4224    }
4225
4226    #[simd_test(enable = "avx2")]
4227    unsafe fn test_mm256_broadcastsd_pd() {
4228        let a = _mm_setr_pd(6.88, 3.44);
4229        let res = _mm256_broadcastsd_pd(a);
4230        assert_eq_m256d(res, _mm256_set1_pd(6.88f64));
4231    }
4232
4233    #[simd_test(enable = "avx2")]
4234    unsafe fn test_mm_broadcastsi128_si256() {
4235        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4236        let res = _mm_broadcastsi128_si256(a);
4237        let retval = _mm256_setr_epi64x(
4238            0x0987654321012334,
4239            0x5678909876543210,
4240            0x0987654321012334,
4241            0x5678909876543210,
4242        );
4243        assert_eq_m256i(res, retval);
4244    }
4245
4246    #[simd_test(enable = "avx2")]
4247    unsafe fn test_mm256_broadcastsi128_si256() {
4248        let a = _mm_setr_epi64x(0x0987654321012334, 0x5678909876543210);
4249        let res = _mm256_broadcastsi128_si256(a);
4250        let retval = _mm256_setr_epi64x(
4251            0x0987654321012334,
4252            0x5678909876543210,
4253            0x0987654321012334,
4254            0x5678909876543210,
4255        );
4256        assert_eq_m256i(res, retval);
4257    }
4258
4259    #[simd_test(enable = "avx2")]
4260    unsafe fn test_mm_broadcastss_ps() {
4261        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4262        let res = _mm_broadcastss_ps(a);
4263        assert_eq_m128(res, _mm_set1_ps(6.88));
4264    }
4265
4266    #[simd_test(enable = "avx2")]
4267    unsafe fn test_mm256_broadcastss_ps() {
4268        let a = _mm_setr_ps(6.88, 3.44, 0.0, 0.0);
4269        let res = _mm256_broadcastss_ps(a);
4270        assert_eq_m256(res, _mm256_set1_ps(6.88));
4271    }
4272
4273    #[simd_test(enable = "avx2")]
4274    unsafe fn test_mm_broadcastw_epi16() {
4275        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4276        let res = _mm_broadcastw_epi16(a);
4277        assert_eq_m128i(res, _mm_set1_epi16(0x22b));
4278    }
4279
4280    #[simd_test(enable = "avx2")]
4281    unsafe fn test_mm256_broadcastw_epi16() {
4282        let a = _mm_insert_epi16::<0>(_mm_set1_epi16(0x2a), 0x22b);
4283        let res = _mm256_broadcastw_epi16(a);
4284        assert_eq_m256i(res, _mm256_set1_epi16(0x22b));
4285    }
4286
4287    #[simd_test(enable = "avx2")]
4288    unsafe fn test_mm256_cmpeq_epi8() {
4289        #[rustfmt::skip]
4290        let a = _mm256_setr_epi8(
4291            0, 1, 2, 3, 4, 5, 6, 7,
4292            8, 9, 10, 11, 12, 13, 14, 15,
4293            16, 17, 18, 19, 20, 21, 22, 23,
4294            24, 25, 26, 27, 28, 29, 30, 31,
4295        );
4296        #[rustfmt::skip]
4297        let b = _mm256_setr_epi8(
4298            31, 30, 2, 28, 27, 26, 25, 24,
4299            23, 22, 21, 20, 19, 18, 17, 16,
4300            15, 14, 13, 12, 11, 10, 9, 8,
4301            7, 6, 5, 4, 3, 2, 1, 0,
4302        );
4303        let r = _mm256_cmpeq_epi8(a, b);
4304        assert_eq_m256i(r, _mm256_insert_epi8::<2>(_mm256_set1_epi8(0), !0));
4305    }
4306
4307    #[simd_test(enable = "avx2")]
4308    unsafe fn test_mm256_cmpeq_epi16() {
4309        #[rustfmt::skip]
4310        let a = _mm256_setr_epi16(
4311            0, 1, 2, 3, 4, 5, 6, 7,
4312            8, 9, 10, 11, 12, 13, 14, 15,
4313        );
4314        #[rustfmt::skip]
4315        let b = _mm256_setr_epi16(
4316            15, 14, 2, 12, 11, 10, 9, 8,
4317            7, 6, 5, 4, 3, 2, 1, 0,
4318        );
4319        let r = _mm256_cmpeq_epi16(a, b);
4320        assert_eq_m256i(r, _mm256_insert_epi16::<2>(_mm256_set1_epi16(0), !0));
4321    }
4322
4323    #[simd_test(enable = "avx2")]
4324    unsafe fn test_mm256_cmpeq_epi32() {
4325        let a = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4326        let b = _mm256_setr_epi32(7, 6, 2, 4, 3, 2, 1, 0);
4327        let r = _mm256_cmpeq_epi32(a, b);
4328        let e = _mm256_set1_epi32(0);
4329        let e = _mm256_insert_epi32::<2>(e, !0);
4330        assert_eq_m256i(r, e);
4331    }
4332
4333    #[simd_test(enable = "avx2")]
4334    unsafe fn test_mm256_cmpeq_epi64() {
4335        let a = _mm256_setr_epi64x(0, 1, 2, 3);
4336        let b = _mm256_setr_epi64x(3, 2, 2, 0);
4337        let r = _mm256_cmpeq_epi64(a, b);
4338        assert_eq_m256i(r, _mm256_insert_epi64::<2>(_mm256_set1_epi64x(0), !0));
4339    }
4340
4341    #[simd_test(enable = "avx2")]
4342    unsafe fn test_mm256_cmpgt_epi8() {
4343        let a = _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), 5);
4344        let b = _mm256_set1_epi8(0);
4345        let r = _mm256_cmpgt_epi8(a, b);
4346        assert_eq_m256i(r, _mm256_insert_epi8::<0>(_mm256_set1_epi8(0), !0));
4347    }
4348
4349    #[simd_test(enable = "avx2")]
4350    unsafe fn test_mm256_cmpgt_epi16() {
4351        let a = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 5);
4352        let b = _mm256_set1_epi16(0);
4353        let r = _mm256_cmpgt_epi16(a, b);
4354        assert_eq_m256i(r, _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), !0));
4355    }
4356
4357    #[simd_test(enable = "avx2")]
4358    unsafe fn test_mm256_cmpgt_epi32() {
4359        let a = _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), 5);
4360        let b = _mm256_set1_epi32(0);
4361        let r = _mm256_cmpgt_epi32(a, b);
4362        assert_eq_m256i(r, _mm256_insert_epi32::<0>(_mm256_set1_epi32(0), !0));
4363    }
4364
4365    #[simd_test(enable = "avx2")]
4366    unsafe fn test_mm256_cmpgt_epi64() {
4367        let a = _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), 5);
4368        let b = _mm256_set1_epi64x(0);
4369        let r = _mm256_cmpgt_epi64(a, b);
4370        assert_eq_m256i(r, _mm256_insert_epi64::<0>(_mm256_set1_epi64x(0), !0));
4371    }
4372
4373    #[simd_test(enable = "avx2")]
4374    unsafe fn test_mm256_cvtepi8_epi16() {
4375        #[rustfmt::skip]
4376        let a = _mm_setr_epi8(
4377            0, 0, -1, 1, -2, 2, -3, 3,
4378            -4, 4, -5, 5, -6, 6, -7, 7,
4379        );
4380        #[rustfmt::skip]
4381        let r = _mm256_setr_epi16(
4382            0, 0, -1, 1, -2, 2, -3, 3,
4383            -4, 4, -5, 5, -6, 6, -7, 7,
4384        );
4385        assert_eq_m256i(r, _mm256_cvtepi8_epi16(a));
4386    }
4387
4388    #[simd_test(enable = "avx2")]
4389    unsafe fn test_mm256_cvtepi8_epi32() {
4390        #[rustfmt::skip]
4391        let a = _mm_setr_epi8(
4392            0, 0, -1, 1, -2, 2, -3, 3,
4393            -4, 4, -5, 5, -6, 6, -7, 7,
4394        );
4395        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4396        assert_eq_m256i(r, _mm256_cvtepi8_epi32(a));
4397    }
4398
4399    #[simd_test(enable = "avx2")]
4400    unsafe fn test_mm256_cvtepi8_epi64() {
4401        #[rustfmt::skip]
4402        let a = _mm_setr_epi8(
4403            0, 0, -1, 1, -2, 2, -3, 3,
4404            -4, 4, -5, 5, -6, 6, -7, 7,
4405        );
4406        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4407        assert_eq_m256i(r, _mm256_cvtepi8_epi64(a));
4408    }
4409
4410    #[simd_test(enable = "avx2")]
4411    unsafe fn test_mm256_cvtepi16_epi32() {
4412        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4413        let r = _mm256_setr_epi32(0, 0, -1, 1, -2, 2, -3, 3);
4414        assert_eq_m256i(r, _mm256_cvtepi16_epi32(a));
4415    }
4416
4417    #[simd_test(enable = "avx2")]
4418    unsafe fn test_mm256_cvtepi16_epi64() {
4419        let a = _mm_setr_epi16(0, 0, -1, 1, -2, 2, -3, 3);
4420        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4421        assert_eq_m256i(r, _mm256_cvtepi16_epi64(a));
4422    }
4423
4424    #[simd_test(enable = "avx2")]
4425    unsafe fn test_mm256_cvtepi32_epi64() {
4426        let a = _mm_setr_epi32(0, 0, -1, 1);
4427        let r = _mm256_setr_epi64x(0, 0, -1, 1);
4428        assert_eq_m256i(r, _mm256_cvtepi32_epi64(a));
4429    }
4430
4431    #[simd_test(enable = "avx2")]
4432    unsafe fn test_mm256_cvtepu16_epi32() {
4433        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4434        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4435        assert_eq_m256i(r, _mm256_cvtepu16_epi32(a));
4436    }
4437
4438    #[simd_test(enable = "avx2")]
4439    unsafe fn test_mm256_cvtepu16_epi64() {
4440        let a = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
4441        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4442        assert_eq_m256i(r, _mm256_cvtepu16_epi64(a));
4443    }
4444
4445    #[simd_test(enable = "avx2")]
4446    unsafe fn test_mm256_cvtepu32_epi64() {
4447        let a = _mm_setr_epi32(0, 1, 2, 3);
4448        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4449        assert_eq_m256i(r, _mm256_cvtepu32_epi64(a));
4450    }
4451
4452    #[simd_test(enable = "avx2")]
4453    unsafe fn test_mm256_cvtepu8_epi16() {
4454        #[rustfmt::skip]
4455        let a = _mm_setr_epi8(
4456            0, 1, 2, 3, 4, 5, 6, 7,
4457            8, 9, 10, 11, 12, 13, 14, 15,
4458        );
4459        #[rustfmt::skip]
4460        let r = _mm256_setr_epi16(
4461            0, 1, 2, 3, 4, 5, 6, 7,
4462            8, 9, 10, 11, 12, 13, 14, 15,
4463        );
4464        assert_eq_m256i(r, _mm256_cvtepu8_epi16(a));
4465    }
4466
4467    #[simd_test(enable = "avx2")]
4468    unsafe fn test_mm256_cvtepu8_epi32() {
4469        #[rustfmt::skip]
4470        let a = _mm_setr_epi8(
4471            0, 1, 2, 3, 4, 5, 6, 7,
4472            8, 9, 10, 11, 12, 13, 14, 15,
4473        );
4474        let r = _mm256_setr_epi32(0, 1, 2, 3, 4, 5, 6, 7);
4475        assert_eq_m256i(r, _mm256_cvtepu8_epi32(a));
4476    }
4477
4478    #[simd_test(enable = "avx2")]
4479    unsafe fn test_mm256_cvtepu8_epi64() {
4480        #[rustfmt::skip]
4481        let a = _mm_setr_epi8(
4482            0, 1, 2, 3, 4, 5, 6, 7,
4483            8, 9, 10, 11, 12, 13, 14, 15,
4484        );
4485        let r = _mm256_setr_epi64x(0, 1, 2, 3);
4486        assert_eq_m256i(r, _mm256_cvtepu8_epi64(a));
4487    }
4488
4489    #[simd_test(enable = "avx2")]
4490    unsafe fn test_mm256_extracti128_si256() {
4491        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4492        let r = _mm256_extracti128_si256::<1>(a);
4493        let e = _mm_setr_epi64x(3, 4);
4494        assert_eq_m128i(r, e);
4495    }
4496
4497    #[simd_test(enable = "avx2")]
4498    unsafe fn test_mm256_hadd_epi16() {
4499        let a = _mm256_set1_epi16(2);
4500        let b = _mm256_set1_epi16(4);
4501        let r = _mm256_hadd_epi16(a, b);
4502        let e = _mm256_setr_epi16(4, 4, 4, 4, 8, 8, 8, 8, 4, 4, 4, 4, 8, 8, 8, 8);
4503        assert_eq_m256i(r, e);
4504    }
4505
4506    #[simd_test(enable = "avx2")]
4507    unsafe fn test_mm256_hadd_epi32() {
4508        let a = _mm256_set1_epi32(2);
4509        let b = _mm256_set1_epi32(4);
4510        let r = _mm256_hadd_epi32(a, b);
4511        let e = _mm256_setr_epi32(4, 4, 8, 8, 4, 4, 8, 8);
4512        assert_eq_m256i(r, e);
4513    }
4514
4515    #[simd_test(enable = "avx2")]
4516    unsafe fn test_mm256_hadds_epi16() {
4517        let a = _mm256_set1_epi16(2);
4518        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4519        let a = _mm256_insert_epi16::<1>(a, 1);
4520        let b = _mm256_set1_epi16(4);
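        // a[0] + a[1] = 0x7fff + 1, which saturates to 0x7fff.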
4521        let r = _mm256_hadds_epi16(a, b);
4522        #[rustfmt::skip]
4523        let e = _mm256_setr_epi16(
4524            0x7FFF, 4, 4, 4, 8, 8, 8, 8,
4525            4, 4, 4, 4, 8, 8, 8, 8,
4526        );
4527        assert_eq_m256i(r, e);
4528    }
4529
4530    #[simd_test(enable = "avx2")]
4531    unsafe fn test_mm256_hsub_epi16() {
4532        let a = _mm256_set1_epi16(2);
4533        let b = _mm256_set1_epi16(4);
4534        let r = _mm256_hsub_epi16(a, b);
4535        let e = _mm256_set1_epi16(0);
4536        assert_eq_m256i(r, e);
4537    }
4538
4539    #[simd_test(enable = "avx2")]
4540    unsafe fn test_mm256_hsub_epi32() {
4541        let a = _mm256_set1_epi32(2);
4542        let b = _mm256_set1_epi32(4);
4543        let r = _mm256_hsub_epi32(a, b);
4544        let e = _mm256_set1_epi32(0);
4545        assert_eq_m256i(r, e);
4546    }
4547
4548    #[simd_test(enable = "avx2")]
4549    unsafe fn test_mm256_hsubs_epi16() {
4550        let a = _mm256_set1_epi16(2);
4551        let a = _mm256_insert_epi16::<0>(a, 0x7fff);
4552        let a = _mm256_insert_epi16::<1>(a, -1);
4553        let b = _mm256_set1_epi16(4);
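        // a[0] - a[1] = 0x7fff - (-1), which saturates to 0x7fff.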
4554        let r = _mm256_hsubs_epi16(a, b);
4555        let e = _mm256_insert_epi16::<0>(_mm256_set1_epi16(0), 0x7FFF);
4556        assert_eq_m256i(r, e);
4557    }
4558
4559    #[simd_test(enable = "avx2")]
4560    unsafe fn test_mm256_madd_epi16() {
4561        let a = _mm256_set1_epi16(2);
4562        let b = _mm256_set1_epi16(4);
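        // Each 32-bit result is the sum of two adjacent products: 2 * 4 + 2 * 4 = 16.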
4563        let r = _mm256_madd_epi16(a, b);
4564        let e = _mm256_set1_epi32(16);
4565        assert_eq_m256i(r, e);
4566    }
4567
4568    #[simd_test(enable = "avx2")]
4569    unsafe fn test_mm256_inserti128_si256() {
4570        let a = _mm256_setr_epi64x(1, 2, 3, 4);
4571        let b = _mm_setr_epi64x(7, 8);
4572        let r = _mm256_inserti128_si256::<1>(a, b);
4573        let e = _mm256_setr_epi64x(1, 2, 7, 8);
4574        assert_eq_m256i(r, e);
4575    }
4576
4577    #[simd_test(enable = "avx2")]
4578    unsafe fn test_mm256_maddubs_epi16() {
4579        let a = _mm256_set1_epi8(2);
4580        let b = _mm256_set1_epi8(4);
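        // Each 16-bit result is the sum of two adjacent byte products: 2 * 4 + 2 * 4 = 16.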
4581        let r = _mm256_maddubs_epi16(a, b);
4582        let e = _mm256_set1_epi16(16);
4583        assert_eq_m256i(r, e);
4584    }
4585
4586    #[simd_test(enable = "avx2")]
4587    unsafe fn test_mm_maskload_epi32() {
4588        let nums = [1, 2, 3, 4];
4589        let a = &nums as *const i32;
4590        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4591        let r = _mm_maskload_epi32(a, mask);
4592        let e = _mm_setr_epi32(1, 0, 0, 4);
4593        assert_eq_m128i(r, e);
4594    }
4595
4596    #[simd_test(enable = "avx2")]
4597    unsafe fn test_mm256_maskload_epi32() {
4598        let nums = [1, 2, 3, 4, 5, 6, 7, 8];
4599        let a = &nums as *const i32;
4600        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4601        let r = _mm256_maskload_epi32(a, mask);
4602        let e = _mm256_setr_epi32(1, 0, 0, 4, 0, 6, 7, 0);
4603        assert_eq_m256i(r, e);
4604    }
4605
4606    #[simd_test(enable = "avx2")]
4607    unsafe fn test_mm_maskload_epi64() {
4608        let nums = [1_i64, 2_i64];
4609        let a = &nums as *const i64;
4610        let mask = _mm_setr_epi64x(0, -1);
4611        let r = _mm_maskload_epi64(a, mask);
4612        let e = _mm_setr_epi64x(0, 2);
4613        assert_eq_m128i(r, e);
4614    }
4615
4616    #[simd_test(enable = "avx2")]
4617    unsafe fn test_mm256_maskload_epi64() {
4618        let nums = [1_i64, 2_i64, 3_i64, 4_i64];
4619        let a = &nums as *const i64;
4620        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4621        let r = _mm256_maskload_epi64(a, mask);
4622        let e = _mm256_setr_epi64x(0, 2, 3, 0);
4623        assert_eq_m256i(r, e);
4624    }
4625
4626    #[simd_test(enable = "avx2")]
4627    unsafe fn test_mm_maskstore_epi32() {
4628        let a = _mm_setr_epi32(1, 2, 3, 4);
4629        let mut arr = [-1, -1, -1, -1];
4630        let mask = _mm_setr_epi32(-1, 0, 0, -1);
4631        _mm_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4632        let e = [1, -1, -1, 4];
4633        assert_eq!(arr, e);
4634    }
4635
4636    #[simd_test(enable = "avx2")]
4637    unsafe fn test_mm256_maskstore_epi32() {
4638        let a = _mm256_setr_epi32(1, 0x6d726f, 3, 42, 0x777161, 6, 7, 8);
4639        let mut arr = [-1, -1, -1, 0x776173, -1, 0x68657265, -1, -1];
4640        let mask = _mm256_setr_epi32(-1, 0, 0, -1, 0, -1, -1, 0);
4641        _mm256_maskstore_epi32(arr.as_mut_ptr(), mask, a);
4642        let e = [1, -1, -1, 42, -1, 6, 7, -1];
4643        assert_eq!(arr, e);
4644    }
4645
4646    #[simd_test(enable = "avx2")]
4647    unsafe fn test_mm_maskstore_epi64() {
4648        let a = _mm_setr_epi64x(1_i64, 2_i64);
4649        let mut arr = [-1_i64, -1_i64];
4650        let mask = _mm_setr_epi64x(0, -1);
4651        _mm_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4652        let e = [-1, 2];
4653        assert_eq!(arr, e);
4654    }
4655
4656    #[simd_test(enable = "avx2")]
4657    unsafe fn test_mm256_maskstore_epi64() {
4658        let a = _mm256_setr_epi64x(1_i64, 2_i64, 3_i64, 4_i64);
4659        let mut arr = [-1_i64, -1_i64, -1_i64, -1_i64];
4660        let mask = _mm256_setr_epi64x(0, -1, -1, 0);
4661        _mm256_maskstore_epi64(arr.as_mut_ptr(), mask, a);
4662        let e = [-1, 2, 3, -1];
4663        assert_eq!(arr, e);
4664    }
4665
4666    #[simd_test(enable = "avx2")]
4667    unsafe fn test_mm256_max_epi16() {
4668        let a = _mm256_set1_epi16(2);
4669        let b = _mm256_set1_epi16(4);
4670        let r = _mm256_max_epi16(a, b);
4671        assert_eq_m256i(r, b);
4672    }
4673
4674    #[simd_test(enable = "avx2")]
4675    unsafe fn test_mm256_max_epi32() {
4676        let a = _mm256_set1_epi32(2);
4677        let b = _mm256_set1_epi32(4);
4678        let r = _mm256_max_epi32(a, b);
4679        assert_eq_m256i(r, b);
4680    }
4681
4682    #[simd_test(enable = "avx2")]
4683    unsafe fn test_mm256_max_epi8() {
4684        let a = _mm256_set1_epi8(2);
4685        let b = _mm256_set1_epi8(4);
4686        let r = _mm256_max_epi8(a, b);
4687        assert_eq_m256i(r, b);
4688    }
4689
4690    #[simd_test(enable = "avx2")]
4691    unsafe fn test_mm256_max_epu16() {
4692        let a = _mm256_set1_epi16(2);
4693        let b = _mm256_set1_epi16(4);
4694        let r = _mm256_max_epu16(a, b);
4695        assert_eq_m256i(r, b);
4696    }
4697
4698    #[simd_test(enable = "avx2")]
4699    unsafe fn test_mm256_max_epu32() {
4700        let a = _mm256_set1_epi32(2);
4701        let b = _mm256_set1_epi32(4);
4702        let r = _mm256_max_epu32(a, b);
4703        assert_eq_m256i(r, b);
4704    }
4705
4706    #[simd_test(enable = "avx2")]
4707    unsafe fn test_mm256_max_epu8() {
4708        let a = _mm256_set1_epi8(2);
4709        let b = _mm256_set1_epi8(4);
4710        let r = _mm256_max_epu8(a, b);
4711        assert_eq_m256i(r, b);
4712    }
4713
4714    #[simd_test(enable = "avx2")]
4715    unsafe fn test_mm256_min_epi16() {
4716        let a = _mm256_set1_epi16(2);
4717        let b = _mm256_set1_epi16(4);
4718        let r = _mm256_min_epi16(a, b);
4719        assert_eq_m256i(r, a);
4720    }
4721
4722    #[simd_test(enable = "avx2")]
4723    unsafe fn test_mm256_min_epi32() {
4724        let a = _mm256_set1_epi32(2);
4725        let b = _mm256_set1_epi32(4);
4726        let r = _mm256_min_epi32(a, b);
4727        assert_eq_m256i(r, a);
4728    }
4729
4730    #[simd_test(enable = "avx2")]
4731    unsafe fn test_mm256_min_epi8() {
4732        let a = _mm256_set1_epi8(2);
4733        let b = _mm256_set1_epi8(4);
4734        let r = _mm256_min_epi8(a, b);
4735        assert_eq_m256i(r, a);
4736    }
4737
4738    #[simd_test(enable = "avx2")]
4739    unsafe fn test_mm256_min_epu16() {
4740        let a = _mm256_set1_epi16(2);
4741        let b = _mm256_set1_epi16(4);
4742        let r = _mm256_min_epu16(a, b);
4743        assert_eq_m256i(r, a);
4744    }
4745
4746    #[simd_test(enable = "avx2")]
4747    unsafe fn test_mm256_min_epu32() {
4748        let a = _mm256_set1_epi32(2);
4749        let b = _mm256_set1_epi32(4);
4750        let r = _mm256_min_epu32(a, b);
4751        assert_eq_m256i(r, a);
4752    }
4753
4754    #[simd_test(enable = "avx2")]
4755    unsafe fn test_mm256_min_epu8() {
4756        let a = _mm256_set1_epi8(2);
4757        let b = _mm256_set1_epi8(4);
4758        let r = _mm256_min_epu8(a, b);
4759        assert_eq_m256i(r, a);
4760    }
4761
4762    #[simd_test(enable = "avx2")]
4763    unsafe fn test_mm256_movemask_epi8() {
4764        let a = _mm256_set1_epi8(-1);
4765        let r = _mm256_movemask_epi8(a);
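        // All 32 sign bits are set, so the mask is 0xFFFF_FFFF, i.e. -1 as an i32.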
4766        let e = -1;
4767        assert_eq!(r, e);
4768    }
4769
4770    #[simd_test(enable = "avx2")]
4771    unsafe fn test_mm256_mpsadbw_epu8() {
4772        let a = _mm256_set1_epi8(2);
4773        let b = _mm256_set1_epi8(4);
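        // With both offsets 0, every four-byte SAD is |2 - 4| * 4 = 8.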
4774        let r = _mm256_mpsadbw_epu8::<0>(a, b);
4775        let e = _mm256_set1_epi16(8);
4776        assert_eq_m256i(r, e);
4777    }
4778
4779    #[simd_test(enable = "avx2")]
4780    unsafe fn test_mm256_mul_epi32() {
4781        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4782        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
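        // Only the even-indexed 32-bit elements are multiplied: 0*1, 0*3, 2*5 and 2*7.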
4783        let r = _mm256_mul_epi32(a, b);
4784        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4785        assert_eq_m256i(r, e);
4786    }
4787
4788    #[simd_test(enable = "avx2")]
4789    unsafe fn test_mm256_mul_epu32() {
4790        let a = _mm256_setr_epi32(0, 0, 0, 0, 2, 2, 2, 2);
4791        let b = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 8);
4792        let r = _mm256_mul_epu32(a, b);
4793        let e = _mm256_setr_epi64x(0, 0, 10, 14);
4794        assert_eq_m256i(r, e);
4795    }
4796
4797    #[simd_test(enable = "avx2")]
4798    unsafe fn test_mm256_mulhi_epi16() {
4799        let a = _mm256_set1_epi16(6535);
4800        let b = _mm256_set1_epi16(6535);
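        // 6535 * 6535 = 42_706_225; the high 16 bits are 42_706_225 >> 16 = 651.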
4801        let r = _mm256_mulhi_epi16(a, b);
4802        let e = _mm256_set1_epi16(651);
4803        assert_eq_m256i(r, e);
4804    }
4805
4806    #[simd_test(enable = "avx2")]
4807    unsafe fn test_mm256_mulhi_epu16() {
4808        let a = _mm256_set1_epi16(6535);
4809        let b = _mm256_set1_epi16(6535);
4810        let r = _mm256_mulhi_epu16(a, b);
4811        let e = _mm256_set1_epi16(651);
4812        assert_eq_m256i(r, e);
4813    }
4814
4815    #[simd_test(enable = "avx2")]
4816    unsafe fn test_mm256_mullo_epi16() {
4817        let a = _mm256_set1_epi16(2);
4818        let b = _mm256_set1_epi16(4);
4819        let r = _mm256_mullo_epi16(a, b);
4820        let e = _mm256_set1_epi16(8);
4821        assert_eq_m256i(r, e);
4822    }
4823
4824    #[simd_test(enable = "avx2")]
4825    unsafe fn test_mm256_mullo_epi32() {
4826        let a = _mm256_set1_epi32(2);
4827        let b = _mm256_set1_epi32(4);
4828        let r = _mm256_mullo_epi32(a, b);
4829        let e = _mm256_set1_epi32(8);
4830        assert_eq_m256i(r, e);
4831    }
4832
4833    #[simd_test(enable = "avx2")]
4834    unsafe fn test_mm256_mulhrs_epi16() {
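        // Q15 fixed-point rounding multiply: 0x4000 (0.5) * 0x2000 (0.25) yields 0x1000 (0.125).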
4835        let a = _mm256_set1_epi16(0x4000);
4836        let b = _mm256_set1_epi16(0x2000);
4837        let r = _mm256_mulhrs_epi16(a, b);
4838        let e = _mm256_set1_epi16(0x1000);
4839        assert_eq_m256i(r, e);
4840    }
4841
4842    #[simd_test(enable = "avx2")]
4843    unsafe fn test_mm256_or_si256() {
4844        let a = _mm256_set1_epi8(-1);
4845        let b = _mm256_set1_epi8(0);
4846        let r = _mm256_or_si256(a, b);
4847        assert_eq_m256i(r, a);
4848    }
4849
4850    #[simd_test(enable = "avx2")]
4851    unsafe fn test_mm256_packs_epi16() {
4852        let a = _mm256_set1_epi16(2);
4853        let b = _mm256_set1_epi16(4);
4854        let r = _mm256_packs_epi16(a, b);
4855        #[rustfmt::skip]
4856        let e = _mm256_setr_epi8(
4857            2, 2, 2, 2, 2, 2, 2, 2,
4858            4, 4, 4, 4, 4, 4, 4, 4,
4859            2, 2, 2, 2, 2, 2, 2, 2,
4860            4, 4, 4, 4, 4, 4, 4, 4,
4861        );
4862
4863        assert_eq_m256i(r, e);
4864    }
4865
4866    #[simd_test(enable = "avx2")]
4867    unsafe fn test_mm256_packs_epi32() {
4868        let a = _mm256_set1_epi32(2);
4869        let b = _mm256_set1_epi32(4);
4870        let r = _mm256_packs_epi32(a, b);
4871        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4872
4873        assert_eq_m256i(r, e);
4874    }
4875
4876    #[simd_test(enable = "avx2")]
4877    unsafe fn test_mm256_packus_epi16() {
4878        let a = _mm256_set1_epi16(2);
4879        let b = _mm256_set1_epi16(4);
4880        let r = _mm256_packus_epi16(a, b);
4881        #[rustfmt::skip]
4882        let e = _mm256_setr_epi8(
4883            2, 2, 2, 2, 2, 2, 2, 2,
4884            4, 4, 4, 4, 4, 4, 4, 4,
4885            2, 2, 2, 2, 2, 2, 2, 2,
4886            4, 4, 4, 4, 4, 4, 4, 4,
4887        );
4888
4889        assert_eq_m256i(r, e);
4890    }
4891
4892    #[simd_test(enable = "avx2")]
4893    unsafe fn test_mm256_packus_epi32() {
4894        let a = _mm256_set1_epi32(2);
4895        let b = _mm256_set1_epi32(4);
4896        let r = _mm256_packus_epi32(a, b);
4897        let e = _mm256_setr_epi16(2, 2, 2, 2, 4, 4, 4, 4, 2, 2, 2, 2, 4, 4, 4, 4);
4898
4899        assert_eq_m256i(r, e);
4900    }
4901
4902    #[simd_test(enable = "avx2")]
4903    unsafe fn test_mm256_sad_epu8() {
4904        let a = _mm256_set1_epi8(2);
4905        let b = _mm256_set1_epi8(4);
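        // Each 64-bit lane sums eight absolute differences: |2 - 4| * 8 = 16.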
4906        let r = _mm256_sad_epu8(a, b);
4907        let e = _mm256_set1_epi64x(16);
4908        assert_eq_m256i(r, e);
4909    }
4910
4911    #[simd_test(enable = "avx2")]
4912    unsafe fn test_mm256_shufflehi_epi16() {
4913        #[rustfmt::skip]
4914        let a = _mm256_setr_epi16(
4915            0, 1, 2, 3, 11, 22, 33, 44,
4916            4, 5, 6, 7, 55, 66, 77, 88,
4917        );
4918        #[rustfmt::skip]
4919        let e = _mm256_setr_epi16(
4920            0, 1, 2, 3, 44, 22, 22, 11,
4921            4, 5, 6, 7, 88, 66, 66, 55,
4922        );
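        // Selector 0b00_01_01_11 reorders the upper four words of each 128-bit lane as indices 3, 1, 1, 0.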
4923        let r = _mm256_shufflehi_epi16::<0b00_01_01_11>(a);
4924        assert_eq_m256i(r, e);
4925    }
4926
4927    #[simd_test(enable = "avx2")]
4928    unsafe fn test_mm256_shufflelo_epi16() {
4929        #[rustfmt::skip]
4930        let a = _mm256_setr_epi16(
4931            11, 22, 33, 44, 0, 1, 2, 3,
4932            55, 66, 77, 88, 4, 5, 6, 7,
4933        );
4934        #[rustfmt::skip]
4935        let e = _mm256_setr_epi16(
4936            44, 22, 22, 11, 0, 1, 2, 3,
4937            88, 66, 66, 55, 4, 5, 6, 7,
4938        );
4939        let r = _mm256_shufflelo_epi16::<0b00_01_01_11>(a);
4940        assert_eq_m256i(r, e);
4941    }
4942
4943    #[simd_test(enable = "avx2")]
4944    unsafe fn test_mm256_sign_epi16() {
4945        let a = _mm256_set1_epi16(2);
4946        let b = _mm256_set1_epi16(-1);
4947        let r = _mm256_sign_epi16(a, b);
4948        let e = _mm256_set1_epi16(-2);
4949        assert_eq_m256i(r, e);
4950    }
4951
4952    #[simd_test(enable = "avx2")]
4953    unsafe fn test_mm256_sign_epi32() {
4954        let a = _mm256_set1_epi32(2);
4955        let b = _mm256_set1_epi32(-1);
4956        let r = _mm256_sign_epi32(a, b);
4957        let e = _mm256_set1_epi32(-2);
4958        assert_eq_m256i(r, e);
4959    }
4960
4961    #[simd_test(enable = "avx2")]
4962    unsafe fn test_mm256_sign_epi8() {
4963        let a = _mm256_set1_epi8(2);
4964        let b = _mm256_set1_epi8(-1);
4965        let r = _mm256_sign_epi8(a, b);
4966        let e = _mm256_set1_epi8(-2);
4967        assert_eq_m256i(r, e);
4968    }
4969
4970    #[simd_test(enable = "avx2")]
4971    unsafe fn test_mm256_sll_epi16() {
4972        let a = _mm256_set1_epi16(0xFF);
4973        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
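        // The shift count is taken from the low 64 bits of `b`, here 4.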
4974        let r = _mm256_sll_epi16(a, b);
4975        assert_eq_m256i(r, _mm256_set1_epi16(0xFF0));
4976    }
4977
4978    #[simd_test(enable = "avx2")]
4979    unsafe fn test_mm256_sll_epi32() {
4980        let a = _mm256_set1_epi32(0xFFFF);
4981        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
4982        let r = _mm256_sll_epi32(a, b);
4983        assert_eq_m256i(r, _mm256_set1_epi32(0xFFFF0));
4984    }
4985
4986    #[simd_test(enable = "avx2")]
4987    unsafe fn test_mm256_sll_epi64() {
4988        let a = _mm256_set1_epi64x(0xFFFFFFFF);
4989        let b = _mm_insert_epi64::<0>(_mm_set1_epi64x(0), 4);
4990        let r = _mm256_sll_epi64(a, b);
4991        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF0));
4992    }
4993
4994    #[simd_test(enable = "avx2")]
4995    unsafe fn test_mm256_slli_epi16() {
4996        assert_eq_m256i(
4997            _mm256_slli_epi16::<4>(_mm256_set1_epi16(0xFF)),
4998            _mm256_set1_epi16(0xFF0),
4999        );
5000    }
5001
5002    #[simd_test(enable = "avx2")]
5003    unsafe fn test_mm256_slli_epi32() {
5004        assert_eq_m256i(
5005            _mm256_slli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5006            _mm256_set1_epi32(0xFFFF0),
5007        );
5008    }
5009
5010    #[simd_test(enable = "avx2")]
5011    unsafe fn test_mm256_slli_epi64() {
5012        assert_eq_m256i(
5013            _mm256_slli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5014            _mm256_set1_epi64x(0xFFFFFFFF0),
5015        );
5016    }
5017
5018    #[simd_test(enable = "avx2")]
5019    unsafe fn test_mm256_slli_si256() {
5020        let a = _mm256_set1_epi64x(0xFFFFFFFF);
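        // Each 128-bit lane is byte-shifted left by 3, so every 64-bit element becomes 0xFFFFFFFF << 24.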
5021        let r = _mm256_slli_si256::<3>(a);
5022        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFFF000000));
5023    }
5024
5025    #[simd_test(enable = "avx2")]
5026    unsafe fn test_mm_sllv_epi32() {
5027        let a = _mm_set1_epi32(2);
5028        let b = _mm_set1_epi32(1);
5029        let r = _mm_sllv_epi32(a, b);
5030        let e = _mm_set1_epi32(4);
5031        assert_eq_m128i(r, e);
5032    }
5033
5034    #[simd_test(enable = "avx2")]
5035    unsafe fn test_mm256_sllv_epi32() {
5036        let a = _mm256_set1_epi32(2);
5037        let b = _mm256_set1_epi32(1);
5038        let r = _mm256_sllv_epi32(a, b);
5039        let e = _mm256_set1_epi32(4);
5040        assert_eq_m256i(r, e);
5041    }
5042
5043    #[simd_test(enable = "avx2")]
5044    unsafe fn test_mm_sllv_epi64() {
5045        let a = _mm_set1_epi64x(2);
5046        let b = _mm_set1_epi64x(1);
5047        let r = _mm_sllv_epi64(a, b);
5048        let e = _mm_set1_epi64x(4);
5049        assert_eq_m128i(r, e);
5050    }
5051
5052    #[simd_test(enable = "avx2")]
5053    unsafe fn test_mm256_sllv_epi64() {
5054        let a = _mm256_set1_epi64x(2);
5055        let b = _mm256_set1_epi64x(1);
5056        let r = _mm256_sllv_epi64(a, b);
5057        let e = _mm256_set1_epi64x(4);
5058        assert_eq_m256i(r, e);
5059    }
5060
5061    #[simd_test(enable = "avx2")]
5062    unsafe fn test_mm256_sra_epi16() {
5063        let a = _mm256_set1_epi16(-1);
5064        let b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
5065        let r = _mm256_sra_epi16(a, b);
5066        assert_eq_m256i(r, _mm256_set1_epi16(-1));
5067    }
5068
5069    #[simd_test(enable = "avx2")]
5070    unsafe fn test_mm256_sra_epi32() {
5071        let a = _mm256_set1_epi32(-1);
5072        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 1);
5073        let r = _mm256_sra_epi32(a, b);
5074        assert_eq_m256i(r, _mm256_set1_epi32(-1));
5075    }
5076
5077    #[simd_test(enable = "avx2")]
5078    unsafe fn test_mm256_srai_epi16() {
5079        assert_eq_m256i(
5080            _mm256_srai_epi16::<1>(_mm256_set1_epi16(-1)),
5081            _mm256_set1_epi16(-1),
5082        );
5083    }
5084
5085    #[simd_test(enable = "avx2")]
5086    unsafe fn test_mm256_srai_epi32() {
5087        assert_eq_m256i(
5088            _mm256_srai_epi32::<1>(_mm256_set1_epi32(-1)),
5089            _mm256_set1_epi32(-1),
5090        );
5091    }
5092
5093    #[simd_test(enable = "avx2")]
5094    unsafe fn test_mm_srav_epi32() {
5095        let a = _mm_set1_epi32(4);
5096        let count = _mm_set1_epi32(1);
5097        let r = _mm_srav_epi32(a, count);
5098        let e = _mm_set1_epi32(2);
5099        assert_eq_m128i(r, e);
5100    }
5101
5102    #[simd_test(enable = "avx2")]
5103    unsafe fn test_mm256_srav_epi32() {
5104        let a = _mm256_set1_epi32(4);
5105        let count = _mm256_set1_epi32(1);
5106        let r = _mm256_srav_epi32(a, count);
5107        let e = _mm256_set1_epi32(2);
5108        assert_eq_m256i(r, e);
5109    }
5110
5111    #[simd_test(enable = "avx2")]
5112    unsafe fn test_mm256_srli_si256() {
5113        #[rustfmt::skip]
5114        let a = _mm256_setr_epi8(
5115            1, 2, 3, 4, 5, 6, 7, 8,
5116            9, 10, 11, 12, 13, 14, 15, 16,
5117            17, 18, 19, 20, 21, 22, 23, 24,
5118            25, 26, 27, 28, 29, 30, 31, 32,
5119        );
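        // The byte shift operates on each 128-bit lane independently.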
5120        let r = _mm256_srli_si256::<3>(a);
5121        #[rustfmt::skip]
5122        let e = _mm256_setr_epi8(
5123            4, 5, 6, 7, 8, 9, 10, 11,
5124            12, 13, 14, 15, 16, 0, 0, 0,
5125            20, 21, 22, 23, 24, 25, 26, 27,
5126            28, 29, 30, 31, 32, 0, 0, 0,
5127        );
5128        assert_eq_m256i(r, e);
5129    }
5130
5131    #[simd_test(enable = "avx2")]
5132    unsafe fn test_mm256_srl_epi16() {
5133        let a = _mm256_set1_epi16(0xFF);
5134        let b = _mm_insert_epi16::<0>(_mm_set1_epi16(0), 4);
5135        let r = _mm256_srl_epi16(a, b);
5136        assert_eq_m256i(r, _mm256_set1_epi16(0xF));
5137    }
5138
5139    #[simd_test(enable = "avx2")]
5140    unsafe fn test_mm256_srl_epi32() {
5141        let a = _mm256_set1_epi32(0xFFFF);
5142        let b = _mm_insert_epi32::<0>(_mm_set1_epi32(0), 4);
5143        let r = _mm256_srl_epi32(a, b);
5144        assert_eq_m256i(r, _mm256_set1_epi32(0xFFF));
5145    }
5146
5147    #[simd_test(enable = "avx2")]
5148    unsafe fn test_mm256_srl_epi64() {
5149        let a = _mm256_set1_epi64x(0xFFFFFFFF);
5150        let b = _mm_setr_epi64x(4, 0);
5151        let r = _mm256_srl_epi64(a, b);
5152        assert_eq_m256i(r, _mm256_set1_epi64x(0xFFFFFFF));
5153    }
5154
5155    #[simd_test(enable = "avx2")]
5156    unsafe fn test_mm256_srli_epi16() {
5157        assert_eq_m256i(
5158            _mm256_srli_epi16::<4>(_mm256_set1_epi16(0xFF)),
5159            _mm256_set1_epi16(0xF),
5160        );
5161    }
5162
5163    #[simd_test(enable = "avx2")]
5164    unsafe fn test_mm256_srli_epi32() {
5165        assert_eq_m256i(
5166            _mm256_srli_epi32::<4>(_mm256_set1_epi32(0xFFFF)),
5167            _mm256_set1_epi32(0xFFF),
5168        );
5169    }
5170
5171    #[simd_test(enable = "avx2")]
5172    unsafe fn test_mm256_srli_epi64() {
5173        assert_eq_m256i(
5174            _mm256_srli_epi64::<4>(_mm256_set1_epi64x(0xFFFFFFFF)),
5175            _mm256_set1_epi64x(0xFFFFFFF),
5176        );
5177    }
5178
5179    #[simd_test(enable = "avx2")]
5180    unsafe fn test_mm_srlv_epi32() {
5181        let a = _mm_set1_epi32(2);
5182        let count = _mm_set1_epi32(1);
5183        let r = _mm_srlv_epi32(a, count);
5184        let e = _mm_set1_epi32(1);
5185        assert_eq_m128i(r, e);
5186    }
5187
5188    #[simd_test(enable = "avx2")]
5189    unsafe fn test_mm256_srlv_epi32() {
5190        let a = _mm256_set1_epi32(2);
5191        let count = _mm256_set1_epi32(1);
5192        let r = _mm256_srlv_epi32(a, count);
5193        let e = _mm256_set1_epi32(1);
5194        assert_eq_m256i(r, e);
5195    }
5196
5197    #[simd_test(enable = "avx2")]
5198    unsafe fn test_mm_srlv_epi64() {
5199        let a = _mm_set1_epi64x(2);
5200        let count = _mm_set1_epi64x(1);
5201        let r = _mm_srlv_epi64(a, count);
5202        let e = _mm_set1_epi64x(1);
5203        assert_eq_m128i(r, e);
5204    }
5205
5206    #[simd_test(enable = "avx2")]
5207    unsafe fn test_mm256_srlv_epi64() {
5208        let a = _mm256_set1_epi64x(2);
5209        let count = _mm256_set1_epi64x(1);
5210        let r = _mm256_srlv_epi64(a, count);
5211        let e = _mm256_set1_epi64x(1);
5212        assert_eq_m256i(r, e);
5213    }
5214
5215    #[simd_test(enable = "avx2")]
5216    unsafe fn test_mm256_stream_load_si256() {
5217        let a = _mm256_set_epi64x(5, 6, 7, 8);
5218        let r = _mm256_stream_load_si256(core::ptr::addr_of!(a) as *const _);
5219        assert_eq_m256i(a, r);
5220    }
5221
5222    #[simd_test(enable = "avx2")]
5223    unsafe fn test_mm256_sub_epi16() {
5224        let a = _mm256_set1_epi16(4);
5225        let b = _mm256_set1_epi16(2);
5226        let r = _mm256_sub_epi16(a, b);
5227        assert_eq_m256i(r, b);
5228    }
5229
5230    #[simd_test(enable = "avx2")]
5231    unsafe fn test_mm256_sub_epi32() {
5232        let a = _mm256_set1_epi32(4);
5233        let b = _mm256_set1_epi32(2);
5234        let r = _mm256_sub_epi32(a, b);
5235        assert_eq_m256i(r, b);
5236    }
5237
5238    #[simd_test(enable = "avx2")]
5239    unsafe fn test_mm256_sub_epi64() {
5240        let a = _mm256_set1_epi64x(4);
5241        let b = _mm256_set1_epi64x(2);
5242        let r = _mm256_sub_epi64(a, b);
5243        assert_eq_m256i(r, b);
5244    }
5245
5246    #[simd_test(enable = "avx2")]
5247    unsafe fn test_mm256_sub_epi8() {
5248        let a = _mm256_set1_epi8(4);
5249        let b = _mm256_set1_epi8(2);
5250        let r = _mm256_sub_epi8(a, b);
5251        assert_eq_m256i(r, b);
5252    }
5253
5254    #[simd_test(enable = "avx2")]
5255    unsafe fn test_mm256_subs_epi16() {
5256        let a = _mm256_set1_epi16(4);
5257        let b = _mm256_set1_epi16(2);
5258        let r = _mm256_subs_epi16(a, b);
5259        assert_eq_m256i(r, b);
5260    }
5261
5262    #[simd_test(enable = "avx2")]
5263    unsafe fn test_mm256_subs_epi8() {
5264        let a = _mm256_set1_epi8(4);
5265        let b = _mm256_set1_epi8(2);
5266        let r = _mm256_subs_epi8(a, b);
5267        assert_eq_m256i(r, b);
5268    }
5269
5270    #[simd_test(enable = "avx2")]
5271    unsafe fn test_mm256_subs_epu16() {
5272        let a = _mm256_set1_epi16(4);
5273        let b = _mm256_set1_epi16(2);
5274        let r = _mm256_subs_epu16(a, b);
5275        assert_eq_m256i(r, b);
5276    }
5277
5278    #[simd_test(enable = "avx2")]
5279    unsafe fn test_mm256_subs_epu8() {
5280        let a = _mm256_set1_epi8(4);
5281        let b = _mm256_set1_epi8(2);
5282        let r = _mm256_subs_epu8(a, b);
5283        assert_eq_m256i(r, b);
5284    }
5285
5286    #[simd_test(enable = "avx2")]
5287    unsafe fn test_mm256_xor_si256() {
5288        let a = _mm256_set1_epi8(5);
5289        let b = _mm256_set1_epi8(3);
5290        let r = _mm256_xor_si256(a, b);
5291        assert_eq_m256i(r, _mm256_set1_epi8(6));
5292    }
5293
5294    #[simd_test(enable = "avx2")]
5295    unsafe fn test_mm256_alignr_epi8() {
5296        #[rustfmt::skip]
5297        let a = _mm256_setr_epi8(
5298            1, 2, 3, 4, 5, 6, 7, 8,
5299            9, 10, 11, 12, 13, 14, 15, 16,
5300            17, 18, 19, 20, 21, 22, 23, 24,
5301            25, 26, 27, 28, 29, 30, 31, 32,
5302        );
5303        #[rustfmt::skip]
5304        let b = _mm256_setr_epi8(
5305            -1, -2, -3, -4, -5, -6, -7, -8,
5306            -9, -10, -11, -12, -13, -14, -15, -16,
5307            -17, -18, -19, -20, -21, -22, -23, -24,
5308            -25, -26, -27, -28, -29, -30, -31, -32,
5309        );
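        // Shift counts of 32 or more zero the result.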
5310        let r = _mm256_alignr_epi8::<33>(a, b);
5311        assert_eq_m256i(r, _mm256_set1_epi8(0));
5312
5313        let r = _mm256_alignr_epi8::<17>(a, b);
5314        #[rustfmt::skip]
5315        let expected = _mm256_setr_epi8(
5316            2, 3, 4, 5, 6, 7, 8, 9,
5317            10, 11, 12, 13, 14, 15, 16, 0,
5318            18, 19, 20, 21, 22, 23, 24, 25,
5319            26, 27, 28, 29, 30, 31, 32, 0,
5320        );
5321        assert_eq_m256i(r, expected);
5322
5323        let r = _mm256_alignr_epi8::<4>(a, b);
5324        #[rustfmt::skip]
5325        let expected = _mm256_setr_epi8(
5326            -5, -6, -7, -8, -9, -10, -11, -12,
5327            -13, -14, -15, -16, 1, 2, 3, 4,
5328            -21, -22, -23, -24, -25, -26, -27, -28,
5329            -29, -30, -31, -32, 17, 18, 19, 20,
5330        );
5331        assert_eq_m256i(r, expected);
5332
5333        let r = _mm256_alignr_epi8::<15>(a, b);
5334        #[rustfmt::skip]
5335        let expected = _mm256_setr_epi8(
5336            -16, 1, 2, 3, 4, 5, 6, 7,
5337            8, 9, 10, 11, 12, 13, 14, 15,
5338            -32, 17, 18, 19, 20, 21, 22, 23,
5339            24, 25, 26, 27, 28, 29, 30, 31,
5340        );
5341        assert_eq_m256i(r, expected);
5342
5343        let r = _mm256_alignr_epi8::<0>(a, b);
5344        assert_eq_m256i(r, b);
5345
5346        let r = _mm256_alignr_epi8::<16>(a, b);
5347        assert_eq_m256i(r, a);
5348    }
5349
5350    #[simd_test(enable = "avx2")]
5351    unsafe fn test_mm256_shuffle_epi8() {
5352        #[rustfmt::skip]
5353        let a = _mm256_setr_epi8(
5354            1, 2, 3, 4, 5, 6, 7, 8,
5355            9, 10, 11, 12, 13, 14, 15, 16,
5356            17, 18, 19, 20, 21, 22, 23, 24,
5357            25, 26, 27, 28, 29, 30, 31, 32,
5358        );
5359        #[rustfmt::skip]
5360        let b = _mm256_setr_epi8(
5361            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5362            12, 5, 5, 10, 4, 1, 8, 0,
5363            4, 128u8 as i8, 4, 3, 24, 12, 6, 19,
5364            12, 5, 5, 10, 4, 1, 8, 0,
5365        );
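        // Index bytes with the high bit set (here 128) zero the output byte; other indices select within the same 128-bit lane.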
5366        #[rustfmt::skip]
5367        let expected = _mm256_setr_epi8(
5368            5, 0, 5, 4, 9, 13, 7, 4,
5369            13, 6, 6, 11, 5, 2, 9, 1,
5370            21, 0, 21, 20, 25, 29, 23, 20,
5371            29, 22, 22, 27, 21, 18, 25, 17,
5372        );
5373        let r = _mm256_shuffle_epi8(a, b);
5374        assert_eq_m256i(r, expected);
5375    }
5376
5377    #[simd_test(enable = "avx2")]
5378    unsafe fn test_mm256_permutevar8x32_epi32() {
5379        let a = _mm256_setr_epi32(100, 200, 300, 400, 500, 600, 700, 800);
5380        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5381        let expected = _mm256_setr_epi32(600, 100, 600, 200, 800, 700, 400, 500);
5382        let r = _mm256_permutevar8x32_epi32(a, b);
5383        assert_eq_m256i(r, expected);
5384    }
5385
5386    #[simd_test(enable = "avx2")]
5387    unsafe fn test_mm256_permute4x64_epi64() {
5388        let a = _mm256_setr_epi64x(100, 200, 300, 400);
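        // Selector 0b00_01_00_11 picks elements 3, 0, 1, 0 of `a`.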
5389        let expected = _mm256_setr_epi64x(400, 100, 200, 100);
5390        let r = _mm256_permute4x64_epi64::<0b00010011>(a);
5391        assert_eq_m256i(r, expected);
5392    }
5393
5394    #[simd_test(enable = "avx2")]
5395    unsafe fn test_mm256_permute2x128_si256() {
5396        let a = _mm256_setr_epi64x(100, 200, 500, 600);
5397        let b = _mm256_setr_epi64x(300, 400, 700, 800);
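        // Selector 0b00_01_00_11: the low 128 bits come from `b`'s high half (3), the high 128 bits from `a`'s high half (1).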
5398        let r = _mm256_permute2x128_si256::<0b00_01_00_11>(a, b);
5399        let e = _mm256_setr_epi64x(700, 800, 500, 600);
5400        assert_eq_m256i(r, e);
5401    }
5402
5403    #[simd_test(enable = "avx2")]
5404    unsafe fn test_mm256_permute4x64_pd() {
5405        let a = _mm256_setr_pd(1., 2., 3., 4.);
5406        let r = _mm256_permute4x64_pd::<0b00_01_00_11>(a);
5407        let e = _mm256_setr_pd(4., 1., 2., 1.);
5408        assert_eq_m256d(r, e);
5409    }
5410
5411    #[simd_test(enable = "avx2")]
5412    unsafe fn test_mm256_permutevar8x32_ps() {
5413        let a = _mm256_setr_ps(1., 2., 3., 4., 5., 6., 7., 8.);
5414        let b = _mm256_setr_epi32(5, 0, 5, 1, 7, 6, 3, 4);
5415        let r = _mm256_permutevar8x32_ps(a, b);
5416        let e = _mm256_setr_ps(6., 1., 6., 2., 8., 7., 4., 5.);
5417        assert_eq_m256(r, e);
5418    }
5419
5420    #[simd_test(enable = "avx2")]
5421    unsafe fn test_mm_i32gather_epi32() {
5422        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5423        // A multiplier of 4 is word-addressing
5424        let r = _mm_i32gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5425        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5426    }
5427
5428    #[simd_test(enable = "avx2")]
5429    unsafe fn test_mm_mask_i32gather_epi32() {
5430        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5431        // A multiplier of 4 is word-addressing
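        // Lanes whose mask sign bit is clear keep the corresponding element of `src` (256).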
5432        let r = _mm_mask_i32gather_epi32::<4>(
5433            _mm_set1_epi32(256),
5434            arr.as_ptr(),
5435            _mm_setr_epi32(0, 16, 64, 96),
5436            _mm_setr_epi32(-1, -1, -1, 0),
5437        );
5438        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5439    }
5440
5441    #[simd_test(enable = "avx2")]
5442    unsafe fn test_mm256_i32gather_epi32() {
5443        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5444        // A multiplier of 4 is word-addressing
5445        let r =
5446            _mm256_i32gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5447        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5448    }
5449
5450    #[simd_test(enable = "avx2")]
5451    unsafe fn test_mm256_mask_i32gather_epi32() {
5452        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5453        // A multiplier of 4 is word-addressing
5454        let r = _mm256_mask_i32gather_epi32::<4>(
5455            _mm256_set1_epi32(256),
5456            arr.as_ptr(),
5457            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5458            _mm256_setr_epi32(-1, -1, -1, 0, 0, 0, 0, 0),
5459        );
5460        assert_eq_m256i(r, _mm256_setr_epi32(0, 16, 64, 256, 256, 256, 256, 256));
5461    }
5462
5463    #[simd_test(enable = "avx2")]
5464    unsafe fn test_mm_i32gather_ps() {
5465        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5466        // A multiplier of 4 is word-addressing for f32s
5467        let r = _mm_i32gather_ps::<4>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5468        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
5469    }
5470
5471    #[simd_test(enable = "avx2")]
5472    unsafe fn test_mm_mask_i32gather_ps() {
5473        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5474        // A multiplier of 4 is word-addressing for f32s
5475        let r = _mm_mask_i32gather_ps::<4>(
5476            _mm_set1_ps(256.0),
5477            arr.as_ptr(),
5478            _mm_setr_epi32(0, 16, 64, 96),
5479            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
5480        );
5481        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
5482    }
5483
5484    #[simd_test(enable = "avx2")]
5485    unsafe fn test_mm256_i32gather_ps() {
5486        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5487        // A multiplier of 4 is word-addressing for f32s
5488        let r =
5489            _mm256_i32gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi32(0, 16, 32, 48, 1, 2, 3, 4));
5490        assert_eq_m256(r, _mm256_setr_ps(0.0, 16.0, 32.0, 48.0, 1.0, 2.0, 3.0, 4.0));
5491    }
5492
5493    #[simd_test(enable = "avx2")]
5494    unsafe fn test_mm256_mask_i32gather_ps() {
5495        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
5496        // A multiplier of 4 is word-addressing for f32s
5497        let r = _mm256_mask_i32gather_ps::<4>(
5498            _mm256_set1_ps(256.0),
5499            arr.as_ptr(),
5500            _mm256_setr_epi32(0, 16, 64, 96, 0, 0, 0, 0),
5501            _mm256_setr_ps(-1.0, -1.0, -1.0, 0.0, 0.0, 0.0, 0.0, 0.0),
5502        );
5503        assert_eq_m256(
5504            r,
5505            _mm256_setr_ps(0.0, 16.0, 64.0, 256.0, 256.0, 256.0, 256.0, 256.0),
5506        );
5507    }
5508
5509    #[simd_test(enable = "avx2")]
5510    unsafe fn test_mm_i32gather_epi64() {
5511        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5512        // A multiplier of 8 is word-addressing for i64s
5513        let r = _mm_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5514        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
5515    }
5516
5517    #[simd_test(enable = "avx2")]
5518    unsafe fn test_mm_mask_i32gather_epi64() {
5519        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5520        // A multiplier of 8 is word-addressing for i64s
5521        let r = _mm_mask_i32gather_epi64::<8>(
5522            _mm_set1_epi64x(256),
5523            arr.as_ptr(),
5524            _mm_setr_epi32(16, 16, 16, 16),
5525            _mm_setr_epi64x(-1, 0),
5526        );
5527        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
5528    }
5529
5530    #[simd_test(enable = "avx2")]
5531    unsafe fn test_mm256_i32gather_epi64() {
5532        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5533        // A multiplier of 8 is word-addressing for i64s
5534        let r = _mm256_i32gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5535        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
5536    }
5537
5538    #[simd_test(enable = "avx2")]
5539    unsafe fn test_mm256_mask_i32gather_epi64() {
5540        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
5541        // A multiplier of 8 is word-addressing for i64s
5542        let r = _mm256_mask_i32gather_epi64::<8>(
5543            _mm256_set1_epi64x(256),
5544            arr.as_ptr(),
5545            _mm_setr_epi32(0, 16, 64, 96),
5546            _mm256_setr_epi64x(-1, -1, -1, 0),
5547        );
5548        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
5549    }
5550
5551    #[simd_test(enable = "avx2")]
5552    unsafe fn test_mm_i32gather_pd() {
5553        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5554        // A multiplier of 8 is word-addressing for f64s
5555        let r = _mm_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 0, 0));
5556        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
5557    }
5558
5559    #[simd_test(enable = "avx2")]
5560    unsafe fn test_mm_mask_i32gather_pd() {
5561        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5562        // A multiplier of 8 is word-addressing for f64s
5563        let r = _mm_mask_i32gather_pd::<8>(
5564            _mm_set1_pd(256.0),
5565            arr.as_ptr(),
5566            _mm_setr_epi32(16, 16, 16, 16),
5567            _mm_setr_pd(-1.0, 0.0),
5568        );
5569        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
5570    }
5571
5572    #[simd_test(enable = "avx2")]
5573    unsafe fn test_mm256_i32gather_pd() {
5574        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5575        // A multiplier of 8 is word-addressing for f64s
5576        let r = _mm256_i32gather_pd::<8>(arr.as_ptr(), _mm_setr_epi32(0, 16, 32, 48));
5577        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
5578    }
5579
5580    #[simd_test(enable = "avx2")]
5581    unsafe fn test_mm256_mask_i32gather_pd() {
5582        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
5583        // A multiplier of 8 is word-addressing for f64s
5584        let r = _mm256_mask_i32gather_pd::<8>(
5585            _mm256_set1_pd(256.0),
5586            arr.as_ptr(),
5587            _mm_setr_epi32(0, 16, 64, 96),
5588            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
5589        );
5590        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
5591    }
5592
5593    #[simd_test(enable = "avx2")]
5594    unsafe fn test_mm_i64gather_epi32() {
5595        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5596        // A multiplier of 4 is word-addressing
5597        let r = _mm_i64gather_epi32::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
5598        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 0, 0));
5599    }
5600
5601    #[simd_test(enable = "avx2")]
5602    unsafe fn test_mm_mask_i64gather_epi32() {
5603        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5604        // A multiplier of 4 is word-addressing
5605        let r = _mm_mask_i64gather_epi32::<4>(
5606            _mm_set1_epi32(256),
5607            arr.as_ptr(),
5608            _mm_setr_epi64x(0, 16),
5609            _mm_setr_epi32(-1, 0, -1, 0),
5610        );
5611        assert_eq_m128i(r, _mm_setr_epi32(0, 256, 0, 0));
5612    }
5613
5614    #[simd_test(enable = "avx2")]
5615    unsafe fn test_mm256_i64gather_epi32() {
5616        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5617        // A multiplier of 4 is word-addressing
5618        let r = _mm256_i64gather_epi32::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
5619        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 32, 48));
5620    }
5621
5622    #[simd_test(enable = "avx2")]
5623    unsafe fn test_mm256_mask_i64gather_epi32() {
5624        let arr: [i32; 128] = core::array::from_fn(|i| i as i32);
5625        // A multiplier of 4 is word-addressing
5626        let r = _mm256_mask_i64gather_epi32::<4>(
5627            _mm_set1_epi32(256),
5628            arr.as_ptr(),
5629            _mm256_setr_epi64x(0, 16, 64, 96),
5630            _mm_setr_epi32(-1, -1, -1, 0),
5631        );
5632        assert_eq_m128i(r, _mm_setr_epi32(0, 16, 64, 256));
5633    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is element-addressing for f32s
        let r = _mm_i64gather_ps::<4>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is element-addressing for f32s
        let r = _mm_mask_i64gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm_setr_epi64x(0, 16),
            _mm_setr_ps(-1.0, 0.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 256.0, 0.0, 0.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is element-addressing for f32s
        let r = _mm256_i64gather_ps::<4>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_ps() {
        let arr: [f32; 128] = core::array::from_fn(|i| i as f32);
        // A multiplier of 4 is element-addressing for f32s
        let r = _mm256_mask_i64gather_ps::<4>(
            _mm_set1_ps(256.0),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm_setr_ps(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m128(r, _mm_setr_ps(0.0, 16.0, 64.0, 256.0));
    }
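
    // The `_mm256_i64gather_epi32`/`_mm256_i64gather_ps` forms take four 64-bit
    // offsets but gather 32-bit elements, so they produce only four lanes and return
    // a 128-bit vector, as the `assert_eq_m128i`/`assert_eq_m128` checks above
    // reflect. For the floating-point masks, -1.0 acts as "lane selected" because
    // only the sign (most significant) bit of each mask lane is inspected.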

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is element-addressing for i64s
        let r = _mm_i64gather_epi64::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128i(r, _mm_setr_epi64x(0, 16));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is element-addressing for i64s
        let r = _mm_mask_i64gather_epi64::<8>(
            _mm_set1_epi64x(256),
            arr.as_ptr(),
            _mm_setr_epi64x(16, 16),
            _mm_setr_epi64x(-1, 0),
        );
        assert_eq_m128i(r, _mm_setr_epi64x(16, 256));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is element-addressing for i64s
        let r = _mm256_i64gather_epi64::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 32, 48));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_epi64() {
        let arr: [i64; 128] = core::array::from_fn(|i| i as i64);
        // A multiplier of 8 is element-addressing for i64s
        let r = _mm256_mask_i64gather_epi64::<8>(
            _mm256_set1_epi64x(256),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm256_setr_epi64x(-1, -1, -1, 0),
        );
        assert_eq_m256i(r, _mm256_setr_epi64x(0, 16, 64, 256));
    }
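
    // The same per-lane rule for 64-bit elements, where a scale of 8 corresponds to
    // plain indexing of an i64 slice (again an illustrative sketch only, not an
    // intrinsic of this crate):
    #[allow(dead_code)]
    fn scalar_mask_i64gather_epi64(
        src: [i64; 4],
        base: &[i64],
        offsets: [i64; 4],
        mask: [i64; 4],
    ) -> [i64; 4] {
        core::array::from_fn(|i| {
            if mask[i] < 0 {
                base[offsets[i] as usize]
            } else {
                src[i]
            }
        })
    }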

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is element-addressing for f64s
        let r = _mm_i64gather_pd::<8>(arr.as_ptr(), _mm_setr_epi64x(0, 16));
        assert_eq_m128d(r, _mm_setr_pd(0.0, 16.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is element-addressing for f64s
        let r = _mm_mask_i64gather_pd::<8>(
            _mm_set1_pd(256.0),
            arr.as_ptr(),
            _mm_setr_epi64x(16, 16),
            _mm_setr_pd(-1.0, 0.0),
        );
        assert_eq_m128d(r, _mm_setr_pd(16.0, 256.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is element-addressing for f64s
        let r = _mm256_i64gather_pd::<8>(arr.as_ptr(), _mm256_setr_epi64x(0, 16, 32, 48));
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 32.0, 48.0));
    }

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_mask_i64gather_pd() {
        let arr: [f64; 128] = core::array::from_fn(|i| i as f64);
        // A multiplier of 8 is element-addressing for f64s
        let r = _mm256_mask_i64gather_pd::<8>(
            _mm256_set1_pd(256.0),
            arr.as_ptr(),
            _mm256_setr_epi64x(0, 16, 64, 96),
            _mm256_setr_pd(-1.0, -1.0, -1.0, 0.0),
        );
        assert_eq_m256d(r, _mm256_setr_pd(0.0, 16.0, 64.0, 256.0));
    }
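
    // In `test_mm_mask_i64gather_pd` and `test_mm_mask_i64gather_epi64` both offsets
    // are 16, which makes the mask's effect visible on its own: lane 0 has its sign
    // bit set and loads arr[16], while lane 1 has a zero mask and keeps the 256.0
    // (or 256) from `src`, even though both lanes address the same element.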

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_extract_epi8() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi8(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
            16, 17, 18, 19, 20, 21, 22, 23,
            24, 25, 26, 27, 28, 29, 30, 31
        );
        let r1 = _mm256_extract_epi8::<0>(a);
        let r2 = _mm256_extract_epi8::<3>(a);
        assert_eq!(r1, 0xFF);
        assert_eq!(r2, 3);
    }
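
    // `_mm256_extract_epi8` returns the selected byte zero-extended to an i32, so the
    // -1 stored in lane 0 reads back as 0xFF; the 16-bit extract below behaves the
    // same way, which is why it yields 0xFFFF.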

    #[simd_test(enable = "avx2")]
    unsafe fn test_mm256_extract_epi16() {
        #[rustfmt::skip]
        let a = _mm256_setr_epi16(
            -1, 1, 2, 3, 4, 5, 6, 7,
            8, 9, 10, 11, 12, 13, 14, 15,
        );
        let r1 = _mm256_extract_epi16::<0>(a);
        let r2 = _mm256_extract_epi16::<3>(a);
        assert_eq!(r1, 0xFFFF);
        assert_eq!(r2, 3);
    }
}