From 9a132fb667def7a6200a0ac57843db1705835f18 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 16 Jun 2026 22:36:32 +0100 Subject: [PATCH 1/5] Use safe stores in WASM store_interleaved --- fearless_simd/src/generated/wasm.rs | 52 ++++++++++++++++------------- fearless_simd_gen/src/mk_wasm.rs | 15 +++++---- 2 files changed, 37 insertions(+), 30 deletions(-) diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 2963ca6b..a63f74e7 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -5384,12 +5384,13 @@ impl Simd for WasmSimd128 { let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower); let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper); let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper); - unsafe { - v128_store(dest[0 * 4usize..].as_mut_ptr() as *mut v128, out0); - v128_store(dest[1 * 4usize..].as_mut_ptr() as *mut v128, out1); - v128_store(dest[2 * 4usize..].as_mut_ptr() as *mut v128, out2); - v128_store(dest[3 * 4usize..].as_mut_ptr() as *mut v128, out3); - } + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out3, &mut chunks[3]); } #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { @@ -6018,12 +6019,13 @@ impl Simd for WasmSimd128 { let out3 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>( v02_upper, v13_upper, ); - unsafe { - v128_store(dest[0 * 16usize..].as_mut_ptr() as *mut v128, out0); - v128_store(dest[1 * 16usize..].as_mut_ptr() as *mut v128, out1); - v128_store(dest[2 * 16usize..].as_mut_ptr() as *mut v128, out2); - v128_store(dest[3 * 16usize..].as_mut_ptr() as *mut v128, out3); - } + let (chunks, 
[]) = dest.as_chunks_mut::<16usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out3, &mut chunks[3]); } #[inline(always)] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { @@ -6718,12 +6720,13 @@ impl Simd for WasmSimd128 { let out1 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_lower, v13_lower); let out2 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_upper, v13_upper); let out3 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_upper, v13_upper); - unsafe { - v128_store(dest[0 * 8usize..].as_mut_ptr() as *mut v128, out0); - v128_store(dest[1 * 8usize..].as_mut_ptr() as *mut v128, out1); - v128_store(dest[2 * 8usize..].as_mut_ptr() as *mut v128, out2); - v128_store(dest[3 * 8usize..].as_mut_ptr() as *mut v128, out3); - } + let (chunks, []) = dest.as_chunks_mut::<8usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out3, &mut chunks[3]); } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { @@ -7424,12 +7427,13 @@ impl Simd for WasmSimd128 { let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower); let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper); let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper); - unsafe { - v128_store(dest[0 * 4usize..].as_mut_ptr() as *mut v128, out0); - v128_store(dest[1 * 4usize..].as_mut_ptr() as *mut v128, out1); - v128_store(dest[2 * 4usize..].as_mut_ptr() as *mut v128, out2); - v128_store(dest[3 * 4usize..].as_mut_ptr() as *mut v128, out3); - } + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { + unreachable!() + }; 
+ crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out3, &mut chunks[3]); } #[inline(always)] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 1a9d35bf..b8404fbb 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -685,6 +685,7 @@ impl Level for WasmSimd128 { } => { assert_eq!(block_count, 4, "only count of 4 is currently supported"); let elems_per_vec = block_size as usize / vec_ty.scalar_bits; + let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits); let (lower_indices, upper_indices, shuffle_fn) = match vec_ty.scalar_bits { 8 => ( @@ -743,12 +744,14 @@ impl Level for WasmSimd128 { let out2 = #shuffle_fn::<#lower_indices>(v02_upper, v13_upper); let out3 = #shuffle_fn::<#upper_indices>(v02_upper, v13_upper); - unsafe { - v128_store(dest[0 * #elems_per_vec..].as_mut_ptr() as *mut v128, out0); - v128_store(dest[1 * #elems_per_vec..].as_mut_ptr() as *mut v128, out1); - v128_store(dest[2 * #elems_per_vec..].as_mut_ptr() as *mut v128, out2); - v128_store(dest[3 * #elems_per_vec..].as_mut_ptr() as *mut v128, out3); - } + let (chunks, []) = dest.as_chunks_mut::<#elems_per_vec>() else { + unreachable!() + }; + + crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out3, &mut chunks[3]); } } } From e0b3b2db37283fd4e9a2d114f34991f4b73212dd Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 16 Jun 2026 22:43:40 +0100 Subject: [PATCH 2/5] Make x86 load/store_interleaved implementation safe using kernel! 
and checked_transmute_store --- fearless_simd/src/generated/avx2.rs | 500 +++++++++++++++----------- fearless_simd/src/generated/sse4_2.rs | 500 +++++++++++++++----------- fearless_simd_gen/src/mk_x86.rs | 102 +++--- 3 files changed, 639 insertions(+), 463 deletions(-) diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 216c6562..fb22baec 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -8605,55 +8605,78 @@ impl Simd for Avx2 { } #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - let (chunks, []) = src.as_chunks::<4usize>() else { - unreachable!() - }; - let v0: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]); - let v1: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]); - let v2: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]); - let v3: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]); - unsafe { - let tmp0 = _mm_unpacklo_ps(v0, v1); - let tmp1 = _mm_unpackhi_ps(v0, v1); - let tmp2 = _mm_unpacklo_ps(v2, v3); - let tmp3 = _mm_unpackhi_ps(v2, v3); - let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - self.combine_f32x8( - self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)), - self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, src: &[f32; 16usize]) -> f32x16 { + let (chunks, []) = src.as_chunks::<4usize>() else { + unreachable!() + }; + let v0: __m128 = + 
crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]); + let v1: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]); + let v2: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]); + let v3: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]); + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + token.combine_f32x8( + token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)), + token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) } #[inline(always)] fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let (v01, v23) = self.split_f32x16(a); - let (v0, v1) = self.split_f32x8(v01); - let (v2, v3) = self.split_f32x8(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_ps(v0, v1); - let tmp1 = _mm_unpackhi_ps(v0, v1); - let tmp2 = _mm_unpacklo_ps(v2, v3); - let tmp3 = _mm_unpackhi_ps(v2, v3); - let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as 
*mut _, out1); - _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2); - _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3); - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let (v01, v23) = token.split_f32x16(a); + let (v0, v1) = token.split_f32x8(v01); + let (v2, v3) = token.split_f32x8(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest) } #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { @@ -9222,65 +9245,88 @@ impl Simd for Avx2 { } #[inline(always)] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { - let (chunks, []) = src.as_chunks::<16usize>() else { - unreachable!() - }; - let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]); - let v2: __m128i 
= - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]); - unsafe { - let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); - let v0 = _mm_shuffle_epi8(v0, mask); - let v1 = _mm_shuffle_epi8(v1, mask); - let v2 = _mm_shuffle_epi8(v2, mask); - let v3 = _mm_shuffle_epi8(v3, mask); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - self.combine_u8x32( - self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)), - self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, src: &[u8; 64usize]) -> u8x64 { + let (chunks, []) = src.as_chunks::<16usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let v0 = _mm_shuffle_epi8(v0, mask); + let v1 = _mm_shuffle_epi8(v1, mask); + let v2 = _mm_shuffle_epi8(v2, mask); + let v3 = _mm_shuffle_epi8(v3, mask); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = 
_mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + token.combine_u8x32( + token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)), + token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) } #[inline(always)] fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { - let (v01, v23) = self.split_u8x64(a); - let (v0, v1) = self.split_u8x32(v01); - let (v2, v3) = self.split_u8x32(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); - let out0 = _mm_shuffle_epi8(out0, mask); - let out1 = _mm_shuffle_epi8(out1, mask); - let out2 = _mm_shuffle_epi8(out2, mask); - let out3 = _mm_shuffle_epi8(out3, mask); - _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1); - _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2); - _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3); - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u8x64, dest: &mut [u8; 64usize]) -> () { + let (v01, v23) = token.split_u8x64(a); + let (v0, v1) = token.split_u8x32(v01); + let (v2, v3) = token.split_u8x32(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = 
_mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let out0 = _mm_shuffle_epi8(out0, mask); + let out1 = _mm_shuffle_epi8(out1, mask); + let out2 = _mm_shuffle_epi8(out2, mask); + let out3 = _mm_shuffle_epi8(out3, mask); + let (chunks, []) = dest.as_chunks_mut::<16usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest) } #[inline(always)] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { @@ -9974,65 +10020,88 @@ impl Simd for Avx2 { } #[inline(always)] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { - let (chunks, []) = src.as_chunks::<8usize>() else { - unreachable!() - }; - let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]); - let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]); - unsafe { - let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); - let v0 = _mm_shuffle_epi8(v0, mask); - let v1 = _mm_shuffle_epi8(v1, mask); - let v2 = _mm_shuffle_epi8(v2, mask); - let v3 = _mm_shuffle_epi8(v3, mask); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = 
_mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - self.combine_u16x16( - self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)), - self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, src: &[u16; 32usize]) -> u16x32 { + let (chunks, []) = src.as_chunks::<8usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]); + let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + let v0 = _mm_shuffle_epi8(v0, mask); + let v1 = _mm_shuffle_epi8(v1, mask); + let v2 = _mm_shuffle_epi8(v2, mask); + let v3 = _mm_shuffle_epi8(v3, mask); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + token.combine_u16x16( + token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)), + token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) } #[inline(always)] fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { - let (v01, v23) = self.split_u16x32(a); - let (v0, v1) = self.split_u16x16(v01); - let (v2, v3) = self.split_u16x16(v23); - 
let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let out0 = _mm_shuffle_epi8(out0, mask); - let out1 = _mm_shuffle_epi8(out1, mask); - let out2 = _mm_shuffle_epi8(out2, mask); - let out3 = _mm_shuffle_epi8(out3, mask); - _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1); - _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2); - _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3); - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u16x32, dest: &mut [u16; 32usize]) -> () { + let (v01, v23) = token.split_u16x32(a); + let (v0, v1) = token.split_u16x16(v01); + let (v2, v3) = token.split_u16x16(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let out0 = _mm_shuffle_epi8(out0, mask); + let out1 = _mm_shuffle_epi8(out1, mask); + let out2 = _mm_shuffle_epi8(out2, mask); + let out3 = _mm_shuffle_epi8(out3, mask); + let (chunks, []) = dest.as_chunks_mut::<8usize>() else { + unreachable!() + }; + 
crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest) } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { @@ -10718,55 +10787,78 @@ impl Simd for Avx2 { } #[inline(always)] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { - let (chunks, []) = src.as_chunks::<4usize>() else { - unreachable!() - }; - let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]); - let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - self.combine_u32x8( - self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)), - self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, src: &[u32; 16usize]) -> u32x16 { + let (chunks, []) = src.as_chunks::<4usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], 
__m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + token.combine_u32x8( + token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)), + token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) } #[inline(always)] fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - let (v01, v23) = self.split_u32x16(a); - let (v0, v1) = self.split_u32x8(v01); - let (v2, v3) = self.split_u32x8(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1); - _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2); - _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3); - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Avx2, a: u32x16, dest: &mut [u32; 16usize]) -> () { + let (v01, v23) = token.split_u32x16(a); + let (v0, v1) = token.split_u32x8(v01); + let (v2, v3) = token.split_u32x8(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); 
+ let v3 = v3.into(); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest) } #[inline(always)] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index 47b81a4d..7966c826 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -6950,55 +6950,78 @@ impl Simd for Sse4_2 { } #[inline(always)] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16 { - let (chunks, []) = src.as_chunks::<4usize>() else { - unreachable!() - }; - let v0: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]); - let v1: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]); - let v2: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]); - let v3: __m128 = - crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]); - unsafe { - let tmp0 = _mm_unpacklo_ps(v0, v1); - let tmp1 = _mm_unpackhi_ps(v0, v1); - let tmp2 = _mm_unpacklo_ps(v2, v3); - let tmp3 = _mm_unpackhi_ps(v2, v3); - let out0 = 
_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - self.combine_f32x8( - self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)), - self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, src: &[f32; 16usize]) -> f32x16 { + let (chunks, []) = src.as_chunks::<4usize>() else { + unreachable!() + }; + let v0: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]); + let v1: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]); + let v2: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]); + let v3: __m128 = + crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]); + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + token.combine_f32x8( + token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)), + token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) } #[inline(always)] fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> () { - let (v01, v23) = self.split_f32x16(a); - let (v0, v1) = self.split_f32x8(v01); - let (v2, v3) = self.split_f32x8(v23); - let v0 
= v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_ps(v0, v1); - let tmp1 = _mm_unpackhi_ps(v0, v1); - let tmp2 = _mm_unpacklo_ps(v2, v3); - let tmp3 = _mm_unpackhi_ps(v2, v3); - let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); - let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); - _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1); - _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2); - _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3); - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: f32x16, dest: &mut [f32; 16usize]) -> () { + let (v01, v23) = token.split_f32x16(a); + let (v0, v1) = token.split_f32x8(v01); + let (v2, v3) = token.split_f32x8(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let tmp0 = _mm_unpacklo_ps(v0, v1); + let tmp1 = _mm_unpackhi_ps(v0, v1); + let tmp2 = _mm_unpacklo_ps(v2, v3); + let tmp3 = _mm_unpackhi_ps(v2, v3); + let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2))); + let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3))); + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out1, + &mut chunks[1], + ); + 
crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest) } #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { @@ -7567,65 +7590,88 @@ impl Simd for Sse4_2 { } #[inline(always)] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64 { - let (chunks, []) = src.as_chunks::<16usize>() else { - unreachable!() - }; - let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]); - let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]); - unsafe { - let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); - let v0 = _mm_shuffle_epi8(v0, mask); - let v1 = _mm_shuffle_epi8(v1, mask); - let v2 = _mm_shuffle_epi8(v2, mask); - let v3 = _mm_shuffle_epi8(v3, mask); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - self.combine_u8x32( - self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)), - self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, src: &[u8; 64usize]) -> u8x64 { + let (chunks, []) = src.as_chunks::<16usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]); + let v1: __m128i = + 
crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let v0 = _mm_shuffle_epi8(v0, mask); + let v1 = _mm_shuffle_epi8(v1, mask); + let v2 = _mm_shuffle_epi8(v2, mask); + let v3 = _mm_shuffle_epi8(v3, mask); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + token.combine_u8x32( + token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)), + token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) } #[inline(always)] fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> () { - let (v01, v23) = self.split_u8x64(a); - let (v0, v1) = self.split_u8x32(v01); - let (v2, v3) = self.split_u8x32(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); - let out0 = _mm_shuffle_epi8(out0, mask); - let out1 = _mm_shuffle_epi8(out1, mask); - let out2 = _mm_shuffle_epi8(out2, mask); - let out3 = _mm_shuffle_epi8(out3, mask); - 
_mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1); - _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2); - _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3); - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u8x64, dest: &mut [u8; 64usize]) -> () { + let (v01, v23) = token.split_u8x64(a); + let (v0, v1) = token.split_u8x32(v01); + let (v2, v3) = token.split_u8x32(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15); + let out0 = _mm_shuffle_epi8(out0, mask); + let out1 = _mm_shuffle_epi8(out1, mask); + let out2 = _mm_shuffle_epi8(out2, mask); + let out3 = _mm_shuffle_epi8(out3, mask); + let (chunks, []) = dest.as_chunks_mut::<16usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest) } #[inline(always)] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { @@ -8325,65 +8371,88 @@ impl Simd for Sse4_2 { } #[inline(always)] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32 { - let (chunks, []) = src.as_chunks::<8usize>() else { - unreachable!() - }; - 
let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]); - let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]); - unsafe { - let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); - let v0 = _mm_shuffle_epi8(v0, mask); - let v1 = _mm_shuffle_epi8(v1, mask); - let v2 = _mm_shuffle_epi8(v2, mask); - let v3 = _mm_shuffle_epi8(v3, mask); - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - self.combine_u16x16( - self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)), - self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, src: &[u16; 32usize]) -> u16x32 { + let (chunks, []) = src.as_chunks::<8usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]); + let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15); + let v0 = _mm_shuffle_epi8(v0, mask); + let v1 = _mm_shuffle_epi8(v1, mask); + let v2 = _mm_shuffle_epi8(v2, mask); + let v3 = _mm_shuffle_epi8(v3, mask); + let tmp0 = 
_mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + token.combine_u16x16( + token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)), + token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) } #[inline(always)] fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> () { - let (v01, v23) = self.split_u16x32(a); - let (v0, v1) = self.split_u16x16(v01); - let (v2, v3) = self.split_u16x16(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); - let out0 = _mm_shuffle_epi8(out0, mask); - let out1 = _mm_shuffle_epi8(out1, mask); - let out2 = _mm_shuffle_epi8(out2, mask); - let out3 = _mm_shuffle_epi8(out3, mask); - _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1); - _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2); - _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3); - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u16x32, dest: &mut [u16; 32usize]) -> () { + let (v01, v23) = token.split_u16x32(a); + let (v0, v1) = token.split_u16x16(v01); + let (v2, v3) = token.split_u16x16(v23); + let v0 = v0.into(); + let v1 = v1.into(); 
+ let v2 = v2.into(); + let v3 = v3.into(); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15); + let out0 = _mm_shuffle_epi8(out0, mask); + let out1 = _mm_shuffle_epi8(out1, mask); + let out2 = _mm_shuffle_epi8(out2, mask); + let out3 = _mm_shuffle_epi8(out3, mask); + let (chunks, []) = dest.as_chunks_mut::<8usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest) } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { @@ -9057,55 +9126,78 @@ impl Simd for Sse4_2 { } #[inline(always)] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16 { - let (chunks, []) = src.as_chunks::<4usize>() else { - unreachable!() - }; - let v0: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]); - let v1: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]); - let v2: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]); - let v3: __m128i = - crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = 
_mm_unpackhi_epi32(v2, v3); - let out0 = _mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - self.combine_u32x8( - self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)), - self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)), - ) - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, src: &[u32; 16usize]) -> u32x16 { + let (chunks, []) = src.as_chunks::<4usize>() else { + unreachable!() + }; + let v0: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]); + let v1: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]); + let v2: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]); + let v3: __m128i = + crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + token.combine_u32x8( + token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)), + token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)), + ) + } + ); + kernel(self, src) } #[inline(always)] fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> () { - let (v01, v23) = self.split_u32x16(a); - let (v0, v1) = self.split_u32x8(v01); - let (v2, v3) = self.split_u32x8(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); - unsafe { - let tmp0 = _mm_unpacklo_epi32(v0, v1); - let tmp1 = _mm_unpackhi_epi32(v0, v1); - let tmp2 = _mm_unpacklo_epi32(v2, v3); - let tmp3 = _mm_unpackhi_epi32(v2, v3); - let out0 = 
_mm_unpacklo_epi64(tmp0, tmp2); - let out1 = _mm_unpackhi_epi64(tmp0, tmp2); - let out2 = _mm_unpacklo_epi64(tmp1, tmp3); - let out3 = _mm_unpackhi_epi64(tmp1, tmp3); - _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0); - _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1); - _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2); - _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3); - } + crate::kernel!( + #[inline(always)] + fn kernel(token: Sse4_2, a: u32x16, dest: &mut [u32; 16usize]) -> () { + let (v01, v23) = token.split_u32x16(a); + let (v0, v1) = token.split_u32x8(v01); + let (v2, v3) = token.split_u32x8(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); + let tmp0 = _mm_unpacklo_epi32(v0, v1); + let tmp1 = _mm_unpackhi_epi32(v0, v1); + let tmp2 = _mm_unpacklo_epi32(v2, v3); + let tmp3 = _mm_unpackhi_epi32(v2, v3); + let out0 = _mm_unpacklo_epi64(tmp0, tmp2); + let out1 = _mm_unpackhi_epi64(tmp0, tmp2); + let out2 = _mm_unpacklo_epi64(tmp1, tmp3); + let out3 = _mm_unpackhi_epi64(tmp1, tmp3); + let (chunks, []) = dest.as_chunks_mut::<4usize>() else { + unreachable!() + }; + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out0, + &mut chunks[0], + ); + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out1, + &mut chunks[1], + ); + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out2, + &mut chunks[2], + ); + crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>( + out3, + &mut chunks[3], + ); + } + ); + kernel(self, a, dest) } #[inline(always)] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index ddc07ab5..9b47d135 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -219,11 +219,11 @@ impl Level for X86 { OpSig::LoadInterleaved { block_size, block_count, - } => 
self.handle_load_interleaved(method_sig, vec_ty, block_size, block_count), + } => self.handle_load_interleaved(op, vec_ty, block_size, block_count), OpSig::StoreInterleaved { block_size, block_count, - } => self.handle_store_interleaved(method_sig, vec_ty, block_size, block_count), + } => self.handle_store_interleaved(op, vec_ty, block_size, block_count), OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind), OpSig::AsArray { kind } => { generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| { @@ -1781,7 +1781,7 @@ impl X86 { pub(crate) fn handle_load_interleaved( &self, - method_sig: TokenStream, + op: Op, vec_ty: &VecType, block_size: u16, block_count: u16, @@ -1791,7 +1791,7 @@ impl X86 { "only 128-bit blocks are currently supported" ); assert_eq!(block_count, 4, "only count of 4 is currently supported"); - let expr = match vec_ty.scalar_bits { + match vec_ty.scalar_bits { 32 | 16 | 8 => { let block_ty = VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits); @@ -1876,24 +1876,24 @@ impl X86 { } }; - quote! { - let (chunks, []) = src.as_chunks::<#block_len>() else { - unreachable!() - }; - let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( - &chunks[0], - ); - let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( - &chunks[1], - ); - let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( - &chunks[2], - ); - let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( - &chunks[3], - ); + self.kernel_method(op, vec_ty, |token| { + quote! 
{ + let (chunks, []) = src.as_chunks::<#block_len>() else { + unreachable!() + }; + let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( + &chunks[0], + ); + let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( + &chunks[1], + ); + let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( + &chunks[2], + ); + let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>( + &chunks[3], + ); - unsafe { #init_shuffle let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5] @@ -1903,26 +1903,20 @@ impl X86 { #final_unpack - self.#combine_full( - self.#combine_half(out0.simd_into(self), out1.simd_into(self)), - self.#combine_half(out2.simd_into(self), out3.simd_into(self)), + #token.#combine_full( + #token.#combine_half(out0.simd_into(#token), out1.simd_into(#token)), + #token.#combine_half(out2.simd_into(#token), out3.simd_into(#token)), ) } - } + }) } _ => unimplemented!(), - }; - - quote! 
{ - #method_sig { - #expr - } } } pub(crate) fn handle_store_interleaved( &self, - method_sig: TokenStream, + op: Op, vec_ty: &VecType, block_size: u16, block_count: u16, @@ -1932,12 +1926,12 @@ impl X86 { "only 128-bit blocks are currently supported" ); assert_eq!(block_count, 4, "only count of 4 is currently supported"); - let expr = match vec_ty.scalar_bits { + match vec_ty.scalar_bits { 32 | 16 | 8 => { let block_ty = VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits); - let store_unaligned = - intrinsic_ident("storeu", coarse_type(&block_ty), block_ty.n_bits()); + let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits); + let native_ty = self.arch_ty(&block_ty); let vec_32 = block_ty.reinterpret(block_ty.scalar, 32); let unpacklo_32 = simple_sign_unaware_intrinsic("unpacklo", &vec_32); let unpackhi_32 = simple_sign_unaware_intrinsic("unpackhi", &vec_32); @@ -2015,16 +2009,16 @@ impl X86 { } }; - quote! { - let (v01, v23) = self.#split_full(a); - let (v0, v1) = self.#split_half(v01); - let (v2, v3) = self.#split_half(v23); - let v0 = v0.into(); - let v1 = v1.into(); - let v2 = v2.into(); - let v3 = v3.into(); + self.kernel_method(op, vec_ty, |token| { + quote! 
{ + let (v01, v23) = #token.#split_full(a); + let (v0, v1) = #token.#split_half(v01); + let (v2, v3) = #token.#split_half(v23); + let v0 = v0.into(); + let v1 = v1.into(); + let v2 = v2.into(); + let v3 = v3.into(); - unsafe { let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5] let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7] let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13] @@ -2034,20 +2028,18 @@ impl X86 { #post_shuffle - #store_unaligned(dest.as_mut_ptr() as *mut _, out0); - #store_unaligned(dest.as_mut_ptr().add(#block_len) as *mut _, out1); - #store_unaligned(dest.as_mut_ptr().add(2 * #block_len) as *mut _, out2); - #store_unaligned(dest.as_mut_ptr().add(3 * #block_len) as *mut _, out3); + let (chunks, []) = dest.as_chunks_mut::<#block_len>() else { + unreachable!() + }; + + crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out0, &mut chunks[0]); + crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out1, &mut chunks[1]); + crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out2, &mut chunks[2]); + crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out3, &mut chunks[3]); } - } + }) } _ => unimplemented!(), - }; - - quote! 
{ - #method_sig { - #expr - } } } From c40587b476191f42089d156a5b8315a5dbc19e03 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 16 Jun 2026 22:47:21 +0100 Subject: [PATCH 3/5] Placate Clippy --- fearless_simd/src/generated/avx2.rs | 8 ++++---- fearless_simd/src/generated/simd_trait.rs | 14 +++++++------- fearless_simd/src/generated/sse4_2.rs | 8 ++++---- fearless_simd_gen/src/ops.rs | 10 ++++++++-- 4 files changed, 23 insertions(+), 17 deletions(-) diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index fb22baec..361a98b2 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -8676,7 +8676,7 @@ impl Simd for Avx2 { ); } ); - kernel(self, a, dest) + kernel(self, a, dest); } #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { @@ -9326,7 +9326,7 @@ impl Simd for Avx2 { ); } ); - kernel(self, a, dest) + kernel(self, a, dest); } #[inline(always)] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { @@ -10101,7 +10101,7 @@ impl Simd for Avx2 { ); } ); - kernel(self, a, dest) + kernel(self, a, dest); } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { @@ -10858,7 +10858,7 @@ impl Simd for Avx2 { ); } ); - kernel(self, a, dest) + kernel(self, a, dest); } #[inline(always)] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index 1510e274..d68be3a4 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -150,7 +150,7 @@ pub trait Simd: fn neg_f32x4(self, a: f32x4) -> f32x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x4(self, a: f32x4) -> f32x4; - #[doc = "Compute an approximate reciprocal (`1. 
/ x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f32x4(self, a: f32x4) -> f32x4; #[doc = "Add two vectors element-wise."] fn add_f32x4(self, a: f32x4, b: f32x4) -> f32x4; @@ -869,7 +869,7 @@ pub trait Simd: fn neg_f64x2(self, a: f64x2) -> f64x2; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x2(self, a: f64x2) -> f64x2; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] fn approximate_recip_f64x2(self, a: f64x2) -> f64x2; #[doc = "Add two vectors element-wise."] fn add_f64x2(self, a: f64x2, b: f64x2) -> f64x2; @@ -1000,7 +1000,7 @@ pub trait Simd: fn neg_f32x8(self, a: f32x8) -> f32x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x8(self, a: f32x8) -> f32x8; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f32x8(self, a: f32x8) -> f32x8; #[doc = "Add two vectors element-wise."] fn add_f32x8(self, a: f32x8, b: f32x8) -> f32x8; @@ -1741,7 +1741,7 @@ pub trait Simd: fn neg_f64x4(self, a: f64x4) -> f64x4; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x4(self, a: f64x4) -> f64x4; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f64x4(self, a: f64x4) -> f64x4; #[doc = "Add two vectors element-wise."] fn add_f64x4(self, a: f64x4, b: f64x4) -> f64x4; @@ -1876,7 +1876,7 @@ pub trait Simd: fn neg_f32x16(self, a: f32x16) -> f32x16; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f32x16(self, a: f32x16) -> f32x16; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] fn approximate_recip_f32x16(self, a: f32x16) -> f32x16; #[doc = "Add two vectors element-wise."] fn add_f32x16(self, a: f32x16, b: f32x16) -> f32x16; @@ -2611,7 +2611,7 @@ pub trait Simd: fn neg_f64x8(self, a: f64x8) -> f64x8; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt_f64x8(self, a: f64x8) -> f64x8; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip_f64x8(self, a: f64x8) -> f64x8; #[doc = "Add two vectors element-wise."] fn add_f64x8(self, a: f64x8, b: f64x8) -> f64x8; @@ -2862,7 +2862,7 @@ pub trait SimdFloat: fn abs(self) -> Self; #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."] fn sqrt(self) -> Self; - #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. 
The precision of this operation may change as new platform support is added."] + #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."] fn approximate_recip(self) -> Self; #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."] fn copysign(self, rhs: impl SimdInto) -> Self; diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index 7966c826..64d93783 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -7021,7 +7021,7 @@ impl Simd for Sse4_2 { ); } ); - kernel(self, a, dest) + kernel(self, a, dest); } #[inline(always)] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64 { @@ -7671,7 +7671,7 @@ impl Simd for Sse4_2 { ); } ); - kernel(self, a, dest) + kernel(self, a, dest); } #[inline(always)] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16 { @@ -8452,7 +8452,7 @@ impl Simd for Sse4_2 { ); } ); - kernel(self, a, dest) + kernel(self, a, dest); } #[inline(always)] fn narrow_u16x32(self, a: u16x32) -> u8x32 { @@ -9197,7 +9197,7 @@ impl Simd for Sse4_2 { ); } ); - kernel(self, a, dest) + kernel(self, a, dest); } #[inline(always)] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64 { diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index 3ef04fb0..2cd2659e 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -227,6 +227,12 @@ impl Op { let arg_decls = sig.arg_decls(); let call_args = &sig.arg_names; let ret = &sig.ret; 
+ let kernel_call = if matches!(self.sig, OpSig::StoreInterleaved { .. } | OpSig::StoreArray) + { + quote! { kernel(self #(, #call_args)*); } + } else { + quote! { kernel(self #(, #call_args)*) } + }; quote! { #method_sig { @@ -237,7 +243,7 @@ impl Op { } ); - kernel(self #(, #call_args)*) + #kernel_call } } } @@ -641,7 +647,7 @@ const FLOAT_OPS: &[Op] = &[ "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\ This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\ On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \ - On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \ + On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. \ The precision of this operation may change as new platform support is added.", ), Op::new( From 1de7ad04bf66c177a7412c3905cfcf572acb038c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 17 Jun 2026 15:54:38 +0100 Subject: [PATCH 4/5] Add a test for load_interleaved_128_f32x16() which was not previously covered --- fearless_simd_tests/tests/harness/mod.rs | 44 ++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs index 727b5b07..4ef94926 100644 --- a/fearless_simd_tests/tests/harness/mod.rs +++ b/fearless_simd_tests/tests/harness/mod.rs @@ -816,6 +816,50 @@ fn all_false_mask8x16(simd: S) { assert!(!simd.all_false_mask8x16(one_neg)); } +#[simd_test] +fn load_interleaved_128_f32x16(simd: S) { + let data = [ + 0.0, + 4.0, + 8.0, + f32::MIN, + f32::NAN, + -0.0, + 9.0, + 13.0, + f32::INFINITY, + 6.0, + -10.0, + f32::MAX, + -3.0, + f32::NEG_INFINITY, + 11.0, + 15.0, + ]; + let expected = [ + 0.0, + f32::NAN, + f32::INFINITY, + -3.0, + 4.0, + -0.0, + 6.0, + f32::NEG_INFINITY, + 8.0, + 9.0, + -10.0, + 11.0, + f32::MIN, + 13.0, + f32::MAX, + 15.0, + ]; + + // Note: f32::NAN != f32::NAN hence we compare the 
bit pattern. + let result = simd.load_interleaved_128_f32x16(&data); + assert_eq!((*result).map(f32::to_bits), expected.map(f32::to_bits),); +} + #[simd_test] fn load_interleaved_128_u32x16(simd: S) { #[rustfmt::skip] From aa6b43cd14495280956b72427a3c8e79c1f7957c Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 17 Jun 2026 15:59:06 +0100 Subject: [PATCH 5/5] Clarify documentation on load_interleaved/store_interleaved --- fearless_simd/src/generated/simd_trait.rs | 16 ++++++++-------- fearless_simd_gen/src/ops.rs | 16 ++++++++++++++-- 2 files changed, 22 insertions(+), 10 deletions(-) diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs index d68be3a4..bcef3ed5 100644 --- a/fearless_simd/src/generated/simd_trait.rs +++ b/fearless_simd/src/generated/simd_trait.rs @@ -1940,9 +1940,9 @@ pub trait Simd: fn reinterpret_f64_f32x16(self, a: f32x16) -> f64x8; #[doc = "Reinterpret the bits of this vector as a vector of `i32` elements.\n\nThis is a bitwise reinterpretation only, and does not perform any conversions."] fn reinterpret_i32_f32x16(self, a: f32x16) -> i32x16; - #[doc = "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector."] + #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16; - #[doc = "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory."] + 
#[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] fn store_interleaved_128_f32x16(self, a: f32x16, dest: &mut [f32; 16usize]) -> (); #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] fn reinterpret_u8_f32x16(self, a: f32x16) -> u8x64; @@ -2118,9 +2118,9 @@ pub trait Simd: fn max_u8x64(self, a: u8x64, b: u8x64) -> u8x64; #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] fn split_u8x64(self, a: u8x64) -> (u8x32, u8x32); - #[doc = "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector."] + #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64; - #[doc = "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory."] + #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of 
`load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] fn store_interleaved_128_u8x64(self, a: u8x64, dest: &mut [u8; 64usize]) -> (); #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] fn reinterpret_u32_u8x64(self, a: u8x64) -> u32x16; @@ -2323,9 +2323,9 @@ pub trait Simd: fn max_u16x32(self, a: u16x32, b: u16x32) -> u16x32; #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] fn split_u16x32(self, a: u16x32) -> (u16x16, u16x16); - #[doc = "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector."] + #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32; - #[doc = "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory."] + #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. 
It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] fn store_interleaved_128_u16x32(self, a: u16x32, dest: &mut [u16; 32usize]) -> (); #[doc = "Truncate each element to a narrower integer type.\n\nThe number of elements in the result is twice that of the input."] fn narrow_u16x32(self, a: u16x32) -> u8x32; @@ -2534,9 +2534,9 @@ pub trait Simd: fn max_u32x16(self, a: u32x16, b: u32x16) -> u32x16; #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."] fn split_u32x16(self, a: u32x16) -> (u32x8, u32x8); - #[doc = "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector."] + #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."] fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16; - #[doc = "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory."] + #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. 
It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."] fn store_interleaved_128_u32x16(self, a: u32x16, dest: &mut [u32; 16usize]) -> (); #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."] fn reinterpret_u8_u32x16(self, a: u32x16) -> u8x64; diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs index 2cd2659e..5bc18941 100644 --- a/fearless_simd_gen/src/ops.rs +++ b/fearless_simd_gen/src/ops.rs @@ -1353,7 +1353,13 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec { block_size: 128, block_count: 4, }, - "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector.", + "Load elements from an array with 4-way interleaving.\n\n\ + This is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded \ + vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks \ + into one vector.\n\n\ + For example, with 32-bit lanes, memory laid out as \ + `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as \ + `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`.", )); } @@ -1365,7 +1371,13 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec { block_size: 128, block_count: 4, }, - "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory.", + "Store elements to an array with 4-way interleaving.\n\n\ + This is the inverse of `load_interleaved_128`. 
It is different from calling `interleave` and then storing: \ + `interleave` combines two already-loaded vectors, while this operation transposes one vector into four \ + consecutive 128-bit blocks in memory.\n\n\ + For example, with 32-bit lanes, a vector containing \ + `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as \ + `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`.", )); }