linebender · Shnatsel · Jun 17, 2026 · Jun 16, 2026 · Jun 16, 2026 · Jun 16, 2026
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
@@ -5350,12 +5350,13 @@ impl Simd for WasmSimd128 {
         let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower);
         let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper);
         let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper);
-        unsafe {
-            v128_store(dest[0 * 4usize..].as_mut_ptr() as *mut v128, out0);
-            v128_store(dest[1 * 4usize..].as_mut_ptr() as *mut v128, out1);
-            v128_store(dest[2 * 4usize..].as_mut_ptr() as *mut v128, out2);
-            v128_store(dest[3 * 4usize..].as_mut_ptr() as *mut v128, out3);
-        }
+        let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out3, &mut chunks[3]);
     }
     #[inline(always)]
     fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
@@ -5980,12 +5981,13 @@ impl Simd for WasmSimd128 {
         let out3 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
             v02_upper, v13_upper,
         );
-        unsafe {
-            v128_store(dest[0 * 16usize..].as_mut_ptr() as *mut v128, out0);
-            v128_store(dest[1 * 16usize..].as_mut_ptr() as *mut v128, out1);
-            v128_store(dest[2 * 16usize..].as_mut_ptr() as *mut v128, out2);
-            v128_store(dest[3 * 16usize..].as_mut_ptr() as *mut v128, out3);
-        }
+        let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out3, &mut chunks[3]);
     }
     #[inline(always)]
     fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
@@ -6676,12 +6678,13 @@ impl Simd for WasmSimd128 {
         let out1 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_lower, v13_lower);
         let out2 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_upper, v13_upper);
         let out3 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_upper, v13_upper);
-        unsafe {
-            v128_store(dest[0 * 8usize..].as_mut_ptr() as *mut v128, out0);
-            v128_store(dest[1 * 8usize..].as_mut_ptr() as *mut v128, out1);
-            v128_store(dest[2 * 8usize..].as_mut_ptr() as *mut v128, out2);
-            v128_store(dest[3 * 8usize..].as_mut_ptr() as *mut v128, out3);
-        }
+        let (chunks, []) = dest.as_chunks_mut::<8usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out3, &mut chunks[3]);
     }
     #[inline(always)]
     fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
@@ -7378,12 +7381,13 @@ impl Simd for WasmSimd128 {
         let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower);
         let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper);
         let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper);
-        unsafe {
-            v128_store(dest[0 * 4usize..].as_mut_ptr() as *mut v128, out0);
-            v128_store(dest[1 * 4usize..].as_mut_ptr() as *mut v128, out1);
-            v128_store(dest[2 * 4usize..].as_mut_ptr() as *mut v128, out2);
-            v128_store(dest[3 * 4usize..].as_mut_ptr() as *mut v128, out3);
-        }
+        let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
+            unreachable!()
+        };
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out0, &mut chunks[0]);
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out1, &mut chunks[1]);
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out2, &mut chunks[2]);
+        crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out3, &mut chunks[3]);
     }
     #[inline(always)]
     fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {

diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
@@ -683,6 +683,7 @@ impl Level for WasmSimd128 {
             } => {
                 assert_eq!(block_count, 4, "only count of 4 is currently supported");
                 let elems_per_vec = block_size as usize / vec_ty.scalar_bits;
+                let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits);
 
                 let (lower_indices, upper_indices, shuffle_fn) = match vec_ty.scalar_bits {
                     8 => (
@@ -741,12 +742,14 @@ impl Level for WasmSimd128 {
                         let out2 = #shuffle_fn::<#lower_indices>(v02_upper, v13_upper);
                         let out3 = #shuffle_fn::<#upper_indices>(v02_upper, v13_upper);
 
-                        unsafe {
-                            v128_store(dest[0 * #elems_per_vec..].as_mut_ptr() as *mut v128, out0);
-                            v128_store(dest[1 * #elems_per_vec..].as_mut_ptr() as *mut v128, out1);
-                            v128_store(dest[2 * #elems_per_vec..].as_mut_ptr() as *mut v128, out2);
-                            v128_store(dest[3 * #elems_per_vec..].as_mut_ptr() as *mut v128, out3);
-                        }
+                        let (chunks, []) = dest.as_chunks_mut::<#elems_per_vec>() else {
+                            unreachable!()
+                        };
+
+                        crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out0, &mut chunks[0]);
+                        crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out1, &mut chunks[1]);
+                        crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out2, &mut chunks[2]);
+                        crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out3, &mut chunks[3]);
                     }
                 }
             }

diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
@@ -219,11 +219,11 @@ impl Level for X86 {
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
-            } => self.handle_load_interleaved(method_sig, vec_ty, block_size, block_count),
+            } => self.handle_load_interleaved(op, vec_ty, block_size, block_count),
             OpSig::StoreInterleaved {
                 block_size,
                 block_count,
-            } => self.handle_store_interleaved(method_sig, vec_ty, block_size, block_count),
+            } => self.handle_store_interleaved(op, vec_ty, block_size, block_count),
             OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
@@ -1779,7 +1779,7 @@ impl X86 {
 
     pub(crate) fn handle_load_interleaved(
         &self,
-        method_sig: TokenStream,
+        op: Op,
         vec_ty: &VecType,
         block_size: u16,
         block_count: u16,
@@ -1789,7 +1789,7 @@ impl X86 {
             "only 128-bit blocks are currently supported"
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
-        let expr = match vec_ty.scalar_bits {
+        match vec_ty.scalar_bits {
             32 | 16 | 8 => {
                 let block_ty =
                     VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
@@ -1874,24 +1874,24 @@ impl X86 {
                     }
                 };
 
-                quote! {
-                    let (chunks, []) = src.as_chunks::<#block_len>() else {
-                        unreachable!()
-                    };
-                    let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                        &chunks[0],
-                    );
-                    let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                        &chunks[1],
-                    );
-                    let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                        &chunks[2],
-                    );
-                    let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
-                        &chunks[3],
-                    );
+                self.kernel_method(op, vec_ty, |token| {
+                    quote! {
+                        let (chunks, []) = src.as_chunks::<#block_len>() else {
+                            unreachable!()
+                        };
+                        let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[0],
+                        );
+                        let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[1],
+                        );
+                        let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[2],
+                        );
+                        let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
+                            &chunks[3],
+                        );
 
-                    unsafe {
                         #init_shuffle
 
                         let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
@@ -1901,26 +1901,20 @@ impl X86 {
 
                         #final_unpack
 
-                        self.#combine_full(
-                            self.#combine_half(out0.simd_into(self), out1.simd_into(self)),
-                            self.#combine_half(out2.simd_into(self), out3.simd_into(self)),
+                        #token.#combine_full(
+                            #token.#combine_half(out0.simd_into(#token), out1.simd_into(#token)),
+                            #token.#combine_half(out2.simd_into(#token), out3.simd_into(#token)),
                         )
                     }
-                }
+                })
             }
             _ => unimplemented!(),
-        };
-
-        quote! {
-            #method_sig {
-                #expr
-            }
         }
     }
 
     pub(crate) fn handle_store_interleaved(
         &self,
-        method_sig: TokenStream,
+        op: Op,
         vec_ty: &VecType,
         block_size: u16,
         block_count: u16,
@@ -1930,12 +1924,12 @@ impl X86 {
             "only 128-bit blocks are currently supported"
         );
         assert_eq!(block_count, 4, "only count of 4 is currently supported");
-        let expr = match vec_ty.scalar_bits {
+        match vec_ty.scalar_bits {
             32 | 16 | 8 => {
                 let block_ty =
                     VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
-                let store_unaligned =
-                    intrinsic_ident("storeu", coarse_type(&block_ty), block_ty.n_bits());
+                let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits);
+                let native_ty = self.arch_ty(&block_ty);
                 let vec_32 = block_ty.reinterpret(block_ty.scalar, 32);
                 let unpacklo_32 = simple_sign_unaware_intrinsic("unpacklo", &vec_32);
                 let unpackhi_32 = simple_sign_unaware_intrinsic("unpackhi", &vec_32);
@@ -2013,16 +2007,16 @@ impl X86 {
                     }
                 };
 
-                quote! {
-                    let (v01, v23) = self.#split_full(a);
-                    let (v0, v1) = self.#split_half(v01);
-                    let (v2, v3) = self.#split_half(v23);
-                    let v0 = v0.into();
-                    let v1 = v1.into();
-                    let v2 = v2.into();
-                    let v3 = v3.into();
-
-                    unsafe {
+                self.kernel_method(op, vec_ty, |token| {
+                    quote! {
+                        let (v01, v23) = #token.#split_full(a);
+                        let (v0, v1) = #token.#split_half(v01);
+                        let (v2, v3) = #token.#split_half(v23);
+                        let v0 = v0.into();
+                        let v1 = v1.into();
+                        let v2 = v2.into();
+                        let v3 = v3.into();
+
                         let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
                         let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7]
                         let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13]
@@ -2032,20 +2026,18 @@ impl X86 {
 
                         #post_shuffle
 
-                        #store_unaligned(dest.as_mut_ptr() as *mut _, out0);
-                        #store_unaligned(dest.as_mut_ptr().add(#block_len) as *mut _, out1);
-                        #store_unaligned(dest.as_mut_ptr().add(2 * #block_len) as *mut _, out2);
-                        #store_unaligned(dest.as_mut_ptr().add(3 * #block_len) as *mut _, out3);
+                        let (chunks, []) = dest.as_chunks_mut::<#block_len>() else {
+                            unreachable!()
+                        };
+
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out0, &mut chunks[0]);
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out1, &mut chunks[1]);
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out2, &mut chunks[2]);
+                        crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out3, &mut chunks[3]);
                     }
-                }
+                })
             }
             _ => unimplemented!(),
-        };
-
-        quote! {
-            #method_sig {
-                #expr
-            }
         }
     }
 

diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs
@@ -227,6 +227,12 @@ impl Op {
         let arg_decls = sig.arg_decls();
         let call_args = &sig.arg_names;
         let ret = &sig.ret;
+        let kernel_call = if matches!(self.sig, OpSig::StoreInterleaved { .. } | OpSig::StoreArray)
+        {
+            quote! { kernel(self #(, #call_args)*); }
+        } else {
+            quote! { kernel(self #(, #call_args)*) }
+        };
 
         quote! {
             #method_sig {
@@ -237,7 +243,7 @@ impl Op {
                     }
                 );
 
-                kernel(self #(, #call_args)*)
+                #kernel_call
             }
         }
     }
@@ -641,7 +647,7 @@ const FLOAT_OPS: &[Op] = &[
         "Compute an approximate reciprocal (`1. / x`) for each element.\n\n\
          This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\
          On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \
-         On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \
+         On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. \
          The precision of this operation may change as new platform support is added.",
     ),
     Op::new(
@@ -1347,7 +1353,13 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec<Op> {
                 block_size: 128,
                 block_count: 4,
             },
-            "Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector.",
+            "Load elements from an array with 4-way interleaving.\n\n\
+            This is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded \
+            vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks \
+            into one vector.\n\n\
+            For example, with 32-bit lanes, memory laid out as \
+            `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as \
+            `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`.",
         ));
     }
 
@@ -1359,7 +1371,13 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec<Op> {
                 block_size: 128,
                 block_count: 4,
             },
-            "Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory.",
+            "Store elements to an array with 4-way interleaving.\n\n\
+            This is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: \
+            `interleave` combines two already-loaded vectors, while this operation transposes one vector into four \
+            consecutive 128-bit blocks in memory.\n\n\
+            For example, with 32-bit lanes, a vector containing \
+            `[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as \
+            `[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`.",
         ));
     }
 

diff --git a/fearless_simd_tests/tests/harness/mod.rs b/fearless_simd_tests/tests/harness/mod.rs
@@ -816,6 +816,50 @@ fn all_false_mask8x16<S: Simd>(simd: S) {
     assert!(!simd.all_false_mask8x16(one_neg));
 }
 
+#[simd_test]
+fn load_interleaved_128_f32x16<S: Simd>(simd: S) {
+    let data = [
+        0.0,
+        4.0,
+        8.0,
+        f32::MIN,
+        f32::NAN,
+        -0.0,
+        9.0,
+        13.0,
+        f32::INFINITY,
+        6.0,
+        -10.0,
+        f32::MAX,
+        -3.0,
+        f32::NEG_INFINITY,
+        11.0,
+        15.0,
+    ];
+    let expected = [
+        0.0,
+        f32::NAN,
+        f32::INFINITY,
+        -3.0,
+        4.0,
+        -0.0,
+        6.0,
+        f32::NEG_INFINITY,
+        8.0,
+        9.0,
+        -10.0,
+        11.0,
+        f32::MIN,
+        13.0,
+        f32::MAX,
+        15.0,
+    ];
+
+    // Note: `f32::NAN != f32::NAN`, so we compare bit patterns instead of float values.
+    let result = simd.load_interleaved_128_f32x16(&data);
+    assert_eq!((*result).map(f32::to_bits), expected.map(f32::to_bits),);
+}
+
 #[simd_test]
 fn load_interleaved_128_u32x16<S: Simd>(simd: S) {
     #[rustfmt::skip]