From 9a132fb667def7a6200a0ac57843db1705835f18 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 16 Jun 2026 22:36:32 +0100
Subject: [PATCH 1/5] Use safe stores in WASM store_interleaved

---
 fearless_simd/src/generated/wasm.rs | 52 ++++++++++++++++-------------
 fearless_simd_gen/src/mk_wasm.rs    | 15 +++++----
 2 files changed, 37 insertions(+), 30 deletions(-)
diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index 2963ca6b..a63f74e7 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -5384,12 +5384,13 @@ impl Simd for WasmSimd128 {
         let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower);
         let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper);
         let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper);
-        unsafe {
-            v128_store(dest[0 * 4usize..].as_mut_ptr() as *mut v128, out0);
-            v128_store(dest[1 * 4usize..].as_mut_ptr() as *mut v128, out1);
-            v128_store(dest[2 * 4usize..].as_mut_ptr() as *mut v128, out2);
-            v128_store(dest[3 * 4usize..].as_mut_ptr() as *mut v128, out3);
-        }
+        let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out3, &mut chunks[3]);
     }
     #[inline(always)]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
@@ -6018,12 +6019,13 @@ impl Simd for WasmSimd128 {
         let out3 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
             v02_upper, v13_upper,
         );
-        unsafe {
-            v128_store(dest[0 * 16usize..].as_mut_ptr() as *mut v128, out0);
-            v128_store(dest[1 * 16usize..].as_mut_ptr() as *mut v128, out1);
-            v128_store(dest[2 * 16usize..].as_mut_ptr() as *mut v128, out2);
-            v128_store(dest[3 * 16usize..].as_mut_ptr() as *mut v128, out3);
-        }
+        let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out3, &mut chunks[3]);
     }
     #[inline(always)]
     fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
@@ -6718,12 +6720,13 @@ impl Simd for WasmSimd128 {
         let out1 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_lower, v13_lower);
         let out2 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_upper, v13_upper);
         let out3 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_upper, v13_upper);
-        unsafe {
-            v128_store(dest[0 * 8usize..].as_mut_ptr() as *mut v128, out0);
-            v128_store(dest[1 * 8usize..].as_mut_ptr() as *mut v128, out1);
-            v128_store(dest[2 * 8usize..].as_mut_ptr() as *mut v128, out2);
-            v128_store(dest[3 * 8usize..].as_mut_ptr() as *mut v128, out3);
-        }
+        let (chunks, []) = dest.as_chunks_mut::<8usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out3, &mut chunks[3]);
     }
     #[inline(always)]
     fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
@@ -7424,12 +7427,13 @@ impl Simd for WasmSimd128 {
         let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower);
         let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper);
         let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper);
-        unsafe {
-            v128_store(dest[0 * 4usize..].as_mut_ptr() as *mut v128, out0);
-            v128_store(dest[1 * 4usize..].as_mut_ptr() as *mut v128, out1);
-            v128_store(dest[2 * 4usize..].as_mut_ptr() as *mut v128, out2);
-            v128_store(dest[3 * 4usize..].as_mut_ptr() as *mut v128, out3);
-        }
+        let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out3, &mut chunks[3]);
     }
     #[inline(always)]
     fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index 1a9d35bf..b8404fbb 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -685,6 +685,7 @@ impl Level for WasmSimd128 {
             } => {
                 assert_eq!(block_count, 4, "only count of 4 is currently supported");
                 let elems_per_vec = block_size as usize / vec_ty.scalar_bits;
+                let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits);
 
                 let (lower_indices, upper_indices, shuffle_fn) = match vec_ty.scalar_bits {
                     8 => (
@@ -743,12 +744,14 @@ impl Level for WasmSimd128 {
                         let out2 = #shuffle_fn::<#lower_indices>(v02_upper, v13_upper);
                         let out3 = #shuffle_fn::<#upper_indices>(v02_upper, v13_upper);
 
-                        unsafe {
-                            v128_store(dest[0 * #elems_per_vec..].as_mut_ptr() as *mut v128, out0);
-                            v128_store(dest[1 * #elems_per_vec..].as_mut_ptr() as *mut v128, out1);
-                            v128_store(dest[2 * #elems_per_vec..].as_mut_ptr() as *mut v128, out2);
-                            v128_store(dest[3 * #elems_per_vec..].as_mut_ptr() as *mut v128, out3);
-                        }
+                        let (chunks, []) = dest.as_chunks_mut::<#elems_per_vec>() else {
+                            unreachable!()
+                        };
+
+                        crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out0, &mut chunks[0]);
+                        crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out1, &mut chunks[1]);
+                        crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out2, &mut chunks[2]);
+                        crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out3, &mut chunks[3]);
                     }
                 }
             }

From e0b3b2db37283fd4e9a2d114f34991f4b73212dd Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 16 Jun 2026 22:43:40 +0100
Subject: [PATCH 2/5] Make x86 load/store_interleaved implementation safe using
 kernel! and checked_transmute_store

---
 fearless_simd/src/generated/avx2.rs   | 500 +++++++++++++++-----------
 fearless_simd/src/generated/sse4_2.rs | 500 +++++++++++++++-----------
 fearless_simd_gen/src/mk_x86.rs       | 102 +++---
 3 files changed, 639 insertions(+), 463 deletions(-)

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 216c6562..fb22baec 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -8605,55 +8605,78 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
-        let (chunks, []) = src.as_chunks::<4usize>() else {
-            unreachable!()
-        };
-        let v0: __m128 =
-            crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]);
-        let v1: __m128 =
-            crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]);
-        let v2: __m128 =
-            crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]);
-        let v3: __m128 =
-            crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]);
-        unsafe {
-            let tmp0 = _mm_unpacklo_ps(v0, v1);
-            let tmp1 = _mm_unpackhi_ps(v0, v1);
-            let tmp2 = _mm_unpacklo_ps(v2, v3);
-            let tmp3 = _mm_unpackhi_ps(v2, v3);
-            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            self.combine_f32x8(
-                self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, src: &[f32; 16usize]) -> f32x16<Avx2> {
+                let (chunks, []) = src.as_chunks::<4usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]);
+                let v1: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]);
+                let v2: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]);
+                let v3: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]);
+                let tmp0 = _mm_unpacklo_ps(v0, v1);
+                let tmp1 = _mm_unpackhi_ps(v0, v1);
+                let tmp2 = _mm_unpacklo_ps(v2, v3);
+                let tmp3 = _mm_unpackhi_ps(v2, v3);
+                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                token.combine_f32x8(
+                    token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
     }
     #[inline(always)]
     fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        let (v01, v23) = self.split_f32x16(a);
-        let (v0, v1) = self.split_f32x8(v01);
-        let (v2, v3) = self.split_f32x8(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_ps(v0, v1);
-            let tmp1 = _mm_unpackhi_ps(v0, v1);
-            let tmp2 = _mm_unpacklo_ps(v2, v3);
-            let tmp3 = _mm_unpackhi_ps(v2, v3);
-            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1);
-            _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
-            _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: f32x16<Avx2>, dest: &mut [f32; 16usize]) -> () {
+                let (v01, v23) = token.split_f32x16(a);
+                let (v0, v1) = token.split_f32x8(v01);
+                let (v2, v3) = token.split_f32x8(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let tmp0 = _mm_unpacklo_ps(v0, v1);
+                let tmp1 = _mm_unpackhi_ps(v0, v1);
+                let tmp2 = _mm_unpacklo_ps(v2, v3);
+                let tmp3 = _mm_unpackhi_ps(v2, v3);
+                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest)
     }
     #[inline(always)]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
@@ -9222,65 +9245,88 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
-        let (chunks, []) = src.as_chunks::<16usize>() else {
-            unreachable!()
-        };
-        let v0: __m128i =
-            crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]);
-        let v1: __m128i =
-            crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]);
-        let v2: __m128i =
-            crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]);
-        let v3: __m128i =
-            crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]);
-        unsafe {
-            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
-            let v0 = _mm_shuffle_epi8(v0, mask);
-            let v1 = _mm_shuffle_epi8(v1, mask);
-            let v2 = _mm_shuffle_epi8(v2, mask);
-            let v3 = _mm_shuffle_epi8(v3, mask);
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            self.combine_u8x32(
-                self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, src: &[u8; 64usize]) -> u8x64<Avx2> {
+                let (chunks, []) = src.as_chunks::<16usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]);
+                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+                let v0 = _mm_shuffle_epi8(v0, mask);
+                let v1 = _mm_shuffle_epi8(v1, mask);
+                let v2 = _mm_shuffle_epi8(v2, mask);
+                let v3 = _mm_shuffle_epi8(v3, mask);
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                token.combine_u8x32(
+                    token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
     }
     #[inline(always)]
     fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
-        let (v01, v23) = self.split_u8x64(a);
-        let (v0, v1) = self.split_u8x32(v01);
-        let (v2, v3) = self.split_u8x32(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
-            let out0 = _mm_shuffle_epi8(out0, mask);
-            let out1 = _mm_shuffle_epi8(out1, mask);
-            let out2 = _mm_shuffle_epi8(out2, mask);
-            let out3 = _mm_shuffle_epi8(out3, mask);
-            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1);
-            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2);
-            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3);
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u8x64<Avx2>, dest: &mut [u8; 64usize]) -> () {
+                let (v01, v23) = token.split_u8x64(a);
+                let (v0, v1) = token.split_u8x32(v01);
+                let (v2, v3) = token.split_u8x32(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+                let out0 = _mm_shuffle_epi8(out0, mask);
+                let out1 = _mm_shuffle_epi8(out1, mask);
+                let out2 = _mm_shuffle_epi8(out2, mask);
+                let out3 = _mm_shuffle_epi8(out3, mask);
+                let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest)
     }
     #[inline(always)]
     fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
@@ -9974,65 +10020,88 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
-        let (chunks, []) = src.as_chunks::<8usize>() else {
-            unreachable!()
-        };
-        let v0: __m128i =
-            crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]);
-        let v1: __m128i =
-            crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]);
-        let v2: __m128i =
-            crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]);
-        let v3: __m128i =
-            crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]);
-        unsafe {
-            let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
-            let v0 = _mm_shuffle_epi8(v0, mask);
-            let v1 = _mm_shuffle_epi8(v1, mask);
-            let v2 = _mm_shuffle_epi8(v2, mask);
-            let v3 = _mm_shuffle_epi8(v3, mask);
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            self.combine_u16x16(
-                self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, src: &[u16; 32usize]) -> u16x32<Avx2> {
+                let (chunks, []) = src.as_chunks::<8usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]);
+                let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+                let v0 = _mm_shuffle_epi8(v0, mask);
+                let v1 = _mm_shuffle_epi8(v1, mask);
+                let v2 = _mm_shuffle_epi8(v2, mask);
+                let v3 = _mm_shuffle_epi8(v3, mask);
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                token.combine_u16x16(
+                    token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
     }
     #[inline(always)]
     fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
-        let (v01, v23) = self.split_u16x32(a);
-        let (v0, v1) = self.split_u16x16(v01);
-        let (v2, v3) = self.split_u16x16(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
-            let out0 = _mm_shuffle_epi8(out0, mask);
-            let out1 = _mm_shuffle_epi8(out1, mask);
-            let out2 = _mm_shuffle_epi8(out2, mask);
-            let out3 = _mm_shuffle_epi8(out3, mask);
-            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1);
-            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2);
-            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3);
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u16x32<Avx2>, dest: &mut [u16; 32usize]) -> () {
+                let (v01, v23) = token.split_u16x32(a);
+                let (v0, v1) = token.split_u16x16(v01);
+                let (v2, v3) = token.split_u16x16(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+                let out0 = _mm_shuffle_epi8(out0, mask);
+                let out1 = _mm_shuffle_epi8(out1, mask);
+                let out2 = _mm_shuffle_epi8(out2, mask);
+                let out3 = _mm_shuffle_epi8(out3, mask);
+                let (chunks, []) = dest.as_chunks_mut::<8usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest)
     }
     #[inline(always)]
     fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
@@ -10718,55 +10787,78 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
-        let (chunks, []) = src.as_chunks::<4usize>() else {
-            unreachable!()
-        };
-        let v0: __m128i =
-            crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]);
-        let v1: __m128i =
-            crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]);
-        let v2: __m128i =
-            crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]);
-        let v3: __m128i =
-            crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]);
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            self.combine_u32x8(
-                self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, src: &[u32; 16usize]) -> u32x16<Avx2> {
+                let (chunks, []) = src.as_chunks::<4usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]);
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                token.combine_u32x8(
+                    token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
     }
     #[inline(always)]
     fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        let (v01, v23) = self.split_u32x16(a);
-        let (v0, v1) = self.split_u32x8(v01);
-        let (v2, v3) = self.split_u32x8(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1);
-            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
-            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Avx2, a: u32x16<Avx2>, dest: &mut [u32; 16usize]) -> () {
+                let (v01, v23) = token.split_u32x16(a);
+                let (v0, v1) = token.split_u32x8(v01);
+                let (v2, v3) = token.split_u32x8(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest)
     }
     #[inline(always)]
     fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index 47b81a4d..7966c826 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -6950,55 +6950,78 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self> {
-        let (chunks, []) = src.as_chunks::<4usize>() else {
-            unreachable!()
-        };
-        let v0: __m128 =
-            crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]);
-        let v1: __m128 =
-            crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]);
-        let v2: __m128 =
-            crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]);
-        let v3: __m128 =
-            crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]);
-        unsafe {
-            let tmp0 = _mm_unpacklo_ps(v0, v1);
-            let tmp1 = _mm_unpackhi_ps(v0, v1);
-            let tmp2 = _mm_unpacklo_ps(v2, v3);
-            let tmp3 = _mm_unpackhi_ps(v2, v3);
-            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            self.combine_f32x8(
-                self.combine_f32x4(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_f32x4(out2.simd_into(self), out3.simd_into(self)),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, src: &[f32; 16usize]) -> f32x16<Sse4_2> {
+                let (chunks, []) = src.as_chunks::<4usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[0]);
+                let v1: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[1]);
+                let v2: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[2]);
+                let v3: __m128 =
+                    crate::transmute::checked_transmute_copy::<[f32; 4usize], __m128>(&chunks[3]);
+                let tmp0 = _mm_unpacklo_ps(v0, v1);
+                let tmp1 = _mm_unpackhi_ps(v0, v1);
+                let tmp2 = _mm_unpacklo_ps(v2, v3);
+                let tmp3 = _mm_unpackhi_ps(v2, v3);
+                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                token.combine_f32x8(
+                    token.combine_f32x4(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_f32x4(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
     }
     #[inline(always)]
     fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> () {
-        let (v01, v23) = self.split_f32x16(a);
-        let (v0, v1) = self.split_f32x8(v01);
-        let (v2, v3) = self.split_f32x8(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_ps(v0, v1);
-            let tmp1 = _mm_unpackhi_ps(v0, v1);
-            let tmp2 = _mm_unpacklo_ps(v2, v3);
-            let tmp3 = _mm_unpackhi_ps(v2, v3);
-            let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
-            let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
-            _mm_storeu_ps(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_ps(dest.as_mut_ptr().add(4usize) as *mut _, out1);
-            _mm_storeu_ps(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
-            _mm_storeu_ps(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: f32x16<Sse4_2>, dest: &mut [f32; 16usize]) -> () {
+                let (v01, v23) = token.split_f32x16(a);
+                let (v0, v1) = token.split_f32x8(v01);
+                let (v2, v3) = token.split_f32x8(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let tmp0 = _mm_unpacklo_ps(v0, v1);
+                let tmp1 = _mm_unpackhi_ps(v0, v1);
+                let tmp2 = _mm_unpacklo_ps(v2, v3);
+                let tmp3 = _mm_unpackhi_ps(v2, v3);
+                let out0 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out1 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp0), _mm_castps_pd(tmp2)));
+                let out2 = _mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                let out3 = _mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(tmp1), _mm_castps_pd(tmp3)));
+                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128, [f32; 4usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest)
     }
     #[inline(always)]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
@@ -7567,65 +7590,88 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self> {
-        let (chunks, []) = src.as_chunks::<16usize>() else {
-            unreachable!()
-        };
-        let v0: __m128i =
-            crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]);
-        let v1: __m128i =
-            crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]);
-        let v2: __m128i =
-            crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]);
-        let v3: __m128i =
-            crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]);
-        unsafe {
-            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
-            let v0 = _mm_shuffle_epi8(v0, mask);
-            let v1 = _mm_shuffle_epi8(v1, mask);
-            let v2 = _mm_shuffle_epi8(v2, mask);
-            let v3 = _mm_shuffle_epi8(v3, mask);
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            self.combine_u8x32(
-                self.combine_u8x16(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_u8x16(out2.simd_into(self), out3.simd_into(self)),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, src: &[u8; 64usize]) -> u8x64<Sse4_2> {
+                let (chunks, []) = src.as_chunks::<16usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u8; 16usize], __m128i>(&chunks[3]);
+                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+                let v0 = _mm_shuffle_epi8(v0, mask);
+                let v1 = _mm_shuffle_epi8(v1, mask);
+                let v2 = _mm_shuffle_epi8(v2, mask);
+                let v3 = _mm_shuffle_epi8(v3, mask);
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                token.combine_u8x32(
+                    token.combine_u8x16(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u8x16(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
     }
     #[inline(always)]
     fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> () {
-        let (v01, v23) = self.split_u8x64(a);
-        let (v0, v1) = self.split_u8x32(v01);
-        let (v2, v3) = self.split_u8x32(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
-            let out0 = _mm_shuffle_epi8(out0, mask);
-            let out1 = _mm_shuffle_epi8(out1, mask);
-            let out2 = _mm_shuffle_epi8(out2, mask);
-            let out3 = _mm_shuffle_epi8(out3, mask);
-            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_si128(dest.as_mut_ptr().add(16usize) as *mut _, out1);
-            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 16usize) as *mut _, out2);
-            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 16usize) as *mut _, out3);
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u8x64<Sse4_2>, dest: &mut [u8; 64usize]) -> () {
+                let (v01, v23) = token.split_u8x64(a);
+                let (v0, v1) = token.split_u8x32(v01);
+                let (v2, v3) = token.split_u8x32(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                let mask = _mm_setr_epi8(0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15);
+                let out0 = _mm_shuffle_epi8(out0, mask);
+                let out1 = _mm_shuffle_epi8(out1, mask);
+                let out2 = _mm_shuffle_epi8(out2, mask);
+                let out3 = _mm_shuffle_epi8(out3, mask);
+                let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u8; 16usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest)
     }
     #[inline(always)]
     fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
@@ -8325,65 +8371,88 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self> {
-        let (chunks, []) = src.as_chunks::<8usize>() else {
-            unreachable!()
-        };
-        let v0: __m128i =
-            crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]);
-        let v1: __m128i =
-            crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]);
-        let v2: __m128i =
-            crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]);
-        let v3: __m128i =
-            crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]);
-        unsafe {
-            let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
-            let v0 = _mm_shuffle_epi8(v0, mask);
-            let v1 = _mm_shuffle_epi8(v1, mask);
-            let v2 = _mm_shuffle_epi8(v2, mask);
-            let v3 = _mm_shuffle_epi8(v3, mask);
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            self.combine_u16x16(
-                self.combine_u16x8(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_u16x8(out2.simd_into(self), out3.simd_into(self)),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, src: &[u16; 32usize]) -> u16x32<Sse4_2> {
+                let (chunks, []) = src.as_chunks::<8usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u16; 8usize], __m128i>(&chunks[3]);
+                let mask = _mm_setr_epi8(0, 1, 8, 9, 2, 3, 10, 11, 4, 5, 12, 13, 6, 7, 14, 15);
+                let v0 = _mm_shuffle_epi8(v0, mask);
+                let v1 = _mm_shuffle_epi8(v1, mask);
+                let v2 = _mm_shuffle_epi8(v2, mask);
+                let v3 = _mm_shuffle_epi8(v3, mask);
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                token.combine_u16x16(
+                    token.combine_u16x8(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u16x8(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
     }
     #[inline(always)]
     fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> () {
-        let (v01, v23) = self.split_u16x32(a);
-        let (v0, v1) = self.split_u16x16(v01);
-        let (v2, v3) = self.split_u16x16(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
-            let out0 = _mm_shuffle_epi8(out0, mask);
-            let out1 = _mm_shuffle_epi8(out1, mask);
-            let out2 = _mm_shuffle_epi8(out2, mask);
-            let out3 = _mm_shuffle_epi8(out3, mask);
-            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_si128(dest.as_mut_ptr().add(8usize) as *mut _, out1);
-            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 8usize) as *mut _, out2);
-            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 8usize) as *mut _, out3);
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u16x32<Sse4_2>, dest: &mut [u16; 32usize]) -> () {
+                let (v01, v23) = token.split_u16x32(a);
+                let (v0, v1) = token.split_u16x16(v01);
+                let (v2, v3) = token.split_u16x16(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                let mask = _mm_setr_epi8(0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15);
+                let out0 = _mm_shuffle_epi8(out0, mask);
+                let out1 = _mm_shuffle_epi8(out1, mask);
+                let out2 = _mm_shuffle_epi8(out2, mask);
+                let out3 = _mm_shuffle_epi8(out3, mask);
+                let (chunks, []) = dest.as_chunks_mut::<8usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u16; 8usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest)
     }
     #[inline(always)]
     fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
@@ -9057,55 +9126,78 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self> {
-        let (chunks, []) = src.as_chunks::<4usize>() else {
-            unreachable!()
-        };
-        let v0: __m128i =
-            crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]);
-        let v1: __m128i =
-            crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]);
-        let v2: __m128i =
-            crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]);
-        let v3: __m128i =
-            crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]);
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            self.combine_u32x8(
-                self.combine_u32x4(out0.simd_into(self), out1.simd_into(self)),
-                self.combine_u32x4(out2.simd_into(self), out3.simd_into(self)),
-            )
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, src: &[u32; 16usize]) -> u32x16<Sse4_2> {
+                let (chunks, []) = src.as_chunks::<4usize>() else {
+                    unreachable!()
+                };
+                let v0: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[0]);
+                let v1: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[1]);
+                let v2: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[2]);
+                let v3: __m128i =
+                    crate::transmute::checked_transmute_copy::<[u32; 4usize], __m128i>(&chunks[3]);
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                token.combine_u32x8(
+                    token.combine_u32x4(out0.simd_into(token), out1.simd_into(token)),
+                    token.combine_u32x4(out2.simd_into(token), out3.simd_into(token)),
+                )
+            }
+        );
+        kernel(self, src)
     }
     #[inline(always)]
     fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> () {
-        let (v01, v23) = self.split_u32x16(a);
-        let (v0, v1) = self.split_u32x8(v01);
-        let (v2, v3) = self.split_u32x8(v23);
-        let v0 = v0.into();
-        let v1 = v1.into();
-        let v2 = v2.into();
-        let v3 = v3.into();
-        unsafe {
-            let tmp0 = _mm_unpacklo_epi32(v0, v1);
-            let tmp1 = _mm_unpackhi_epi32(v0, v1);
-            let tmp2 = _mm_unpacklo_epi32(v2, v3);
-            let tmp3 = _mm_unpackhi_epi32(v2, v3);
-            let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
-            let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
-            let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
-            let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
-            _mm_storeu_si128(dest.as_mut_ptr() as *mut _, out0);
-            _mm_storeu_si128(dest.as_mut_ptr().add(4usize) as *mut _, out1);
-            _mm_storeu_si128(dest.as_mut_ptr().add(2 * 4usize) as *mut _, out2);
-            _mm_storeu_si128(dest.as_mut_ptr().add(3 * 4usize) as *mut _, out3);
-        }
+        crate::kernel!(
+            #[inline(always)]
+            fn kernel(token: Sse4_2, a: u32x16<Sse4_2>, dest: &mut [u32; 16usize]) -> () {
+                let (v01, v23) = token.split_u32x16(a);
+                let (v0, v1) = token.split_u32x8(v01);
+                let (v2, v3) = token.split_u32x8(v23);
+                let v0 = v0.into();
+                let v1 = v1.into();
+                let v2 = v2.into();
+                let v3 = v3.into();
+                let tmp0 = _mm_unpacklo_epi32(v0, v1);
+                let tmp1 = _mm_unpackhi_epi32(v0, v1);
+                let tmp2 = _mm_unpacklo_epi32(v2, v3);
+                let tmp3 = _mm_unpackhi_epi32(v2, v3);
+                let out0 = _mm_unpacklo_epi64(tmp0, tmp2);
+                let out1 = _mm_unpackhi_epi64(tmp0, tmp2);
+                let out2 = _mm_unpacklo_epi64(tmp1, tmp3);
+                let out3 = _mm_unpackhi_epi64(tmp1, tmp3);
+                let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+                    unreachable!()
+                };
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out0,
+                    &mut chunks[0],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out1,
+                    &mut chunks[1],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out2,
+                    &mut chunks[2],
+                );
+                crate::transmute::checked_transmute_store::<__m128i, [u32; 4usize]>(
+                    out3,
+                    &mut chunks[3],
+                );
+            }
+        );
+        kernel(self, a, dest)
     }
     #[inline(always)]
     fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index ddc07ab5..9b47d135 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -219,11 +219,11 @@ impl Level for X86 {
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
-            } => self.handle_load_interleaved(method_sig, vec_ty, block_size, block_count),
+            } => self.handle_load_interleaved(op, vec_ty, block_size, block_count),
             OpSig::StoreInterleaved {
                 block_size,
                 block_count,
-            } => self.handle_store_interleaved(method_sig, vec_ty, block_size, block_count),
+            } => self.handle_store_interleaved(op, vec_ty, block_size, block_count),
             OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
@@ -1781,7 +1781,7 @@ impl X86 {
 
     pub(crate) fn handle_load_interleaved(
         &self,
-        method_sig: TokenStream,
+        op: Op,
         vec_ty: &VecType,
         block_size: u16,
         block_count: u16,
@@ -1791,7 +1791,7 @@ impl X86 {
             "only 128-bit blocks are currently supported"
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
-        let expr = match vec_ty.scalar_bits {
+        match vec_ty.scalar_bits {
             32 | 16 | 8 => {
                 let block_ty =
                     VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
@@ -1876,24 +1876,24 @@ impl X86 {
                     }
                 };
 
-                quote! {
-                    let (chunks, []) = src.as_chunks::<#block_len>() else {
-                        unreachable!()
-                    };
-                    let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                        &chunks[0],
-                    );
-                    let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                        &chunks[1],
-                    );
-                    let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                        &chunks[2],
-                    );
-                    let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                        &chunks[3],
-                    );
+                self.kernel_method(op, vec_ty, |token| {
+                    quote! {
+                        let (chunks, []) = src.as_chunks::<#block_len>() else {
+                            unreachable!()
+                        };
+                        let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[0],
+                        );
+                        let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[1],
+                        );
+                        let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[2],
+                        );
+                        let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[3],
+                        );
 
-                    unsafe {
                         #init_shuffle
 
                         let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
@@ -1903,26 +1903,20 @@ impl X86 {
 
                         #final_unpack
 
-                        self.#combine_full(
-                            self.#combine_half(out0.simd_into(self), out1.simd_into(self)),
-                            self.#combine_half(out2.simd_into(self), out3.simd_into(self)),
+                        #token.#combine_full(
+                            #token.#combine_half(out0.simd_into(#token), out1.simd_into(#token)),
+                            #token.#combine_half(out2.simd_into(#token), out3.simd_into(#token)),
                         )
                     }
-                }
+                })
             }
             _ => unimplemented!(),
-        };
-
-        quote! {
-            #method_sig {
-                #expr
-            }
         }
     }
 
     pub(crate) fn handle_store_interleaved(
         &self,
-        method_sig: TokenStream,
+        op: Op,
         vec_ty: &VecType,
         block_size: u16,
         block_count: u16,
@@ -1932,12 +1926,12 @@ impl X86 {
             "only 128-bit blocks are currently supported"
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
-        let expr = match vec_ty.scalar_bits {
+        match vec_ty.scalar_bits {
             32 | 16 | 8 => {
                 let block_ty =
                     VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
-                let store_unaligned =
-                    intrinsic_ident("storeu", coarse_type(&block_ty), block_ty.n_bits());
+                let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits);
+                let native_ty = self.arch_ty(&block_ty);
                 let vec_32 = block_ty.reinterpret(block_ty.scalar, 32);
                 let unpacklo_32 = simple_sign_unaware_intrinsic("unpacklo", &vec_32);
                 let unpackhi_32 = simple_sign_unaware_intrinsic("unpackhi", &vec_32);
@@ -2015,16 +2009,16 @@ impl X86 {
                     }
                 };
 
-                quote! {
-                    let (v01, v23) = self.#split_full(a);
-                    let (v0, v1) = self.#split_half(v01);
-                    let (v2, v3) = self.#split_half(v23);
-                    let v0 = v0.into();
-                    let v1 = v1.into();
-                    let v2 = v2.into();
-                    let v3 = v3.into();
+                self.kernel_method(op, vec_ty, |token| {
+                    quote! {
+                        let (v01, v23) = #token.#split_full(a);
+                        let (v0, v1) = #token.#split_half(v01);
+                        let (v2, v3) = #token.#split_half(v23);
+                        let v0 = v0.into();
+                        let v1 = v1.into();
+                        let v2 = v2.into();
+                        let v3 = v3.into();
 
-                    unsafe {
                         let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
                         let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7]
                         let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13]
@@ -2034,20 +2028,18 @@ impl X86 {
 
                         #post_shuffle
 
-                        #store_unaligned(dest.as_mut_ptr() as *mut _, out0);
-                        #store_unaligned(dest.as_mut_ptr().add(#block_len) as *mut _, out1);
-                        #store_unaligned(dest.as_mut_ptr().add(2 * #block_len) as *mut _, out2);
-                        #store_unaligned(dest.as_mut_ptr().add(3 * #block_len) as *mut _, out3);
+                        let (chunks, []) = dest.as_chunks_mut::<#block_len>() else {
+                            unreachable!()
+                        };
+
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out0, &mut chunks[0]);
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out1, &mut chunks[1]);
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out2, &mut chunks[2]);
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out3, &mut chunks[3]);
                     }
-                }
+                })
             }
             _ => unimplemented!(),
-        };
-
-        quote! {
-            #method_sig {
-                #expr
-            }
         }
     }
 

From c40587b476191f42089d156a5b8315a5dbc19e03 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 16 Jun 2026 22:47:21 +0100
Subject: [PATCH 3/5] Placate Clippy

---
 fearless_simd/src/generated/avx2.rs       |  8 ++++----
 fearless_simd/src/generated/simd_trait.rs | 14 +++++++-------
 fearless_simd/src/generated/sse4_2.rs     |  8 ++++----
 fearless_simd_gen/src/ops.rs              | 10 ++++++++--
 4 files changed, 23 insertions(+), 17 deletions(-)

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index fb22baec..361a98b2 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -8676,7 +8676,7 @@ impl Simd for Avx2 {
                 );
             }
         );
-        kernel(self, a, dest)
+        kernel(self, a, dest);
     }
     #[inline(always)]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
@@ -9326,7 +9326,7 @@ impl Simd for Avx2 {
                 );
             }
         );
-        kernel(self, a, dest)
+        kernel(self, a, dest);
     }
     #[inline(always)]
     fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
@@ -10101,7 +10101,7 @@ impl Simd for Avx2 {
                 );
             }
         );
-        kernel(self, a, dest)
+        kernel(self, a, dest);
     }
     #[inline(always)]
     fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
@@ -10858,7 +10858,7 @@ impl Simd for Avx2 {
                 );
             }
         );
-        kernel(self, a, dest)
+        kernel(self, a, dest);
     }
     #[inline(always)]
     fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 1510e274..d68be3a4 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -150,7 +150,7 @@ pub trait Simd:
     fn neg_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x4(self, a: f32x4<Self>) -> f32x4<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x4(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self>;
@@ -869,7 +869,7 @@ pub trait Simd:
     fn neg_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x2(self, a: f64x2<Self>) -> f64x2<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x2(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self>;
@@ -1000,7 +1000,7 @@ pub trait Simd:
     fn neg_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x8(self, a: f32x8<Self>) -> f32x8<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x8(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self>;
@@ -1741,7 +1741,7 @@ pub trait Simd:
     fn neg_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x4(self, a: f64x4<Self>) -> f64x4<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x4(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self>;
@@ -1876,7 +1876,7 @@ pub trait Simd:
     fn neg_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f32x16(self, a: f32x16<Self>) -> f32x16<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f32x16(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self>;
@@ -2611,7 +2611,7 @@ pub trait Simd:
     fn neg_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip_f64x8(self, a: f64x8<Self>) -> f64x8<Self>;
     #[doc = "Add two vectors element-wise."]
     fn add_f64x8(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self>;
@@ -2862,7 +2862,7 @@ pub trait SimdFloat<S: Simd>:
     fn abs(self) -> Self;
     #[doc = "Compute the square root of each element.\n\nNegative elements other than `-0.0` will become NaN."]
     fn sqrt(self) -> Self;
-    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
+    #[doc = "Compute an approximate reciprocal (`1. / x`) for each element.\n\nThis uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\nOn x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. The precision of this operation may change as new platform support is added."]
     fn approximate_recip(self) -> Self;
     #[doc = "Return a vector with the magnitude of `self` and the sign of `rhs` for each element.\n\nThis operation copies the sign bit, so if an input element is NaN, the output element will be a NaN with the same payload and a copied sign bit."]
     fn copysign(self, rhs: impl SimdInto<Self, S>) -> Self;
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index 7966c826..64d93783 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -7021,7 +7021,7 @@ impl Simd for Sse4_2 {
                 );
             }
         );
-        kernel(self, a, dest)
+        kernel(self, a, dest);
     }
     #[inline(always)]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
@@ -7671,7 +7671,7 @@ impl Simd for Sse4_2 {
                 );
             }
         );
-        kernel(self, a, dest)
+        kernel(self, a, dest);
     }
     #[inline(always)]
     fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
@@ -8452,7 +8452,7 @@ impl Simd for Sse4_2 {
                 );
             }
         );
-        kernel(self, a, dest)
+        kernel(self, a, dest);
     }
     #[inline(always)]
     fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
@@ -9197,7 +9197,7 @@ impl Simd for Sse4_2 {
                 );
             }
         );
-        kernel(self, a, dest)
+        kernel(self, a, dest);
     }
     #[inline(always)]
     fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs
index 3ef04fb0..2cd2659e 100644
--- a/fearless_simd_gen/src/ops.rs
+++ b/fearless_simd_gen/src/ops.rs
@@ -227,6 +227,12 @@ impl Op {
         let arg_decls = sig.arg_decls();
         let call_args = &sig.arg_names;
         let ret = &sig.ret;
+        let kernel_call = if matches!(self.sig, OpSig::StoreInterleaved { .. } | OpSig::StoreArray)
+        {
+            quote! { kernel(self #(, #call_args)*); }
+        } else {
+            quote! { kernel(self #(, #call_args)*) }
+        };
 
         quote! {
             #method_sig {
@@ -237,7 +243,7 @@ impl Op {
                     }
                 );
 
-                kernel(self #(, #call_args)*)
+                #kernel_call
             }
         }
     }
@@ -641,7 +647,7 @@ const FLOAT_OPS: &[Op] = &[
         "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\
          This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\
          On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \
-         On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \
+         On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. \
          The precision of this operation may change as new platform support is added.",
     ),
     Op::new(

From 1de7ad04bf66c177a7412c3905cfcf572acb038c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 17 Jun 2026 15:54:38 +0100
Subject: [PATCH 4/5] Add a test for load_interleaved_128_f32x16() which was
 not previously covered

---
 fearless_simd_tests/tests/harness/mod.rs | 44 ++++++++++++++++++++++++
 1 file changed, 44 insertions(+)

diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
index 727b5b07..4ef94926 100644
--- a/fearless_simd_tests/tests/harness/mod.rs
+++ b/fearless_simd_tests/tests/harness/mod.rs
@@ -816,6 +816,50 @@ fn all_false_mask8x16<S: Simd>(simd: S) {
     assert!(!simd.all_false_mask8x16(one_neg));
 }
 
+#[simd_test]
+fn load_interleaved_128_f32x16<S: Simd>(simd: S) {
+    let data = [
+        0.0,
+        4.0,
+        8.0,
+        f32::MIN,
+        f32::NAN,
+        -0.0,
+        9.0,
+        13.0,
+        f32::INFINITY,
+        6.0,
+        -10.0,
+        f32::MAX,
+        -3.0,
+        f32::NEG_INFINITY,
+        11.0,
+        15.0,
+    ];
+    let expected = [
+        0.0,
+        f32::NAN,
+        f32::INFINITY,
+        -3.0,
+        4.0,
+        -0.0,
+        6.0,
+        f32::NEG_INFINITY,
+        8.0,
+        9.0,
+        -10.0,
+        11.0,
+        f32::MIN,
+        13.0,
+        f32::MAX,
+        15.0,
+    ];
+
+    // NaN never equals itself, so compare bit patterns; this also tells `-0.0` from `0.0`.
+    let result = simd.load_interleaved_128_f32x16(&data);
+    assert_eq!((*result).map(f32::to_bits), expected.map(f32::to_bits),);
+}
+
 #[simd_test]
 fn load_interleaved_128_u32x16<S: Simd>(simd: S) {
     #[rustfmt::skip]

From aa6b43cd14495280956b72427a3c8e79c1f7957c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 17 Jun 2026 15:59:06 +0100
Subject: [PATCH 5/5] Clarify documentation on
 load_interleaved/store_interleaved

---
 fearless_simd/src/generated/simd_trait.rs | 16 ++++++++--------
 fearless_simd_gen/src/ops.rs              | 16 ++++++++++++++--
 2 files changed, 22 insertions(+), 10 deletions(-)

diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index d68be3a4..bcef3ed5 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -1940,9 +1940,9 @@ pub trait Simd:
     fn reinterpret_f64_f32x16(self, a: f32x16<Self>) -> f64x8<Self>;
     #[doc = "Reinterpret the bits of this vector as a vector of `i32` elements.\n\nThis is a bitwise reinterpretation only, and does not perform any conversions."]
     fn reinterpret_i32_f32x16(self, a: f32x16<Self>) -> i32x16<Self>;
-    #[doc = "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector."]
+    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
     fn load_interleaved_128_f32x16(self, src: &[f32; 16usize]) -> f32x16<Self>;
-    #[doc = "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory."]
+    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
     fn store_interleaved_128_f32x16(self, a: f32x16<Self>, dest: &mut [f32; 16usize]) -> ();
     #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self>;
@@ -2118,9 +2118,9 @@ pub trait Simd:
     fn max_u8x64(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self>;
     #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
     fn split_u8x64(self, a: u8x64<Self>) -> (u8x32<Self>, u8x32<Self>);
-    #[doc = "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector."]
+    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
     fn load_interleaved_128_u8x64(self, src: &[u8; 64usize]) -> u8x64<Self>;
-    #[doc = "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory."]
+    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
     fn store_interleaved_128_u8x64(self, a: u8x64<Self>, dest: &mut [u8; 64usize]) -> ();
     #[doc = "Reinterpret the bits of this vector as a vector of `u32` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self>;
@@ -2323,9 +2323,9 @@ pub trait Simd:
     fn max_u16x32(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self>;
     #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
     fn split_u16x32(self, a: u16x32<Self>) -> (u16x16<Self>, u16x16<Self>);
-    #[doc = "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector."]
+    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
     fn load_interleaved_128_u16x32(self, src: &[u16; 32usize]) -> u16x32<Self>;
-    #[doc = "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory."]
+    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
     fn store_interleaved_128_u16x32(self, a: u16x32<Self>, dest: &mut [u16; 32usize]) -> ();
     #[doc = "Truncate each element to a narrower integer type.\n\nThe number of elements in the result is twice that of the input."]
     fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self>;
@@ -2534,9 +2534,9 @@ pub trait Simd:
     fn max_u32x16(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self>;
     #[doc = "Split a vector into two vectors of half the width.\n\nReturns a tuple of (lower half, upper half)."]
     fn split_u32x16(self, a: u32x16<Self>) -> (u32x8<Self>, u32x8<Self>);
-    #[doc = "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector."]
+    #[doc = "Load elements from an array with 4-way interleaving.\n\nThis is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks into one vector.\n\nFor example, with 32-bit lanes, memory laid out as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`."]
     fn load_interleaved_128_u32x16(self, src: &[u32; 16usize]) -> u32x16<Self>;
-    #[doc = "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory."]
+    #[doc = "Store elements to an array with 4-way interleaving.\n\nThis is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: `interleave` combines two already-loaded vectors, while this operation transposes one vector into four consecutive 128-bit blocks in memory.\n\nFor example, with 32-bit lanes, a vector containing `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`."]
     fn store_interleaved_128_u32x16(self, a: u32x16<Self>, dest: &mut [u32; 16usize]) -> ();
     #[doc = "Reinterpret the bits of this vector as a vector of `u8` elements.\n\nThe total bit width is preserved; the number of elements changes accordingly."]
     fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self>;
diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs
index 2cd2659e..5bc18941 100644
--- a/fearless_simd_gen/src/ops.rs
+++ b/fearless_simd_gen/src/ops.rs
@@ -1353,7 +1353,13 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec<Op> {
                 block_size: 128,
                 block_count: 4,
             },
-            "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector.",
+            "Load elements from an array with 4-way interleaving.\n\n\
+            This is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded \
+            vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks \
+            into one vector.\n\n\
+            For example, with 32-bit lanes, memory laid out as \
+            `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as \
+            `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`.",
         ));
     }
 
@@ -1365,7 +1371,13 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec<Op> {
                 block_size: 128,
                 block_count: 4,
             },
-            "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory.",
+            "Store elements to an array with 4-way interleaving.\n\n\
+            This is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: \
+            `interleave` combines two already-loaded vectors, while this operation transposes one vector into four \
+            consecutive 128-bit blocks in memory.\n\n\
+            For example, with 32-bit lanes, a vector containing \
+            `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as \
+            `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`.",
         ));
     }