Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
500 changes: 296 additions & 204 deletions fearless_simd/src/generated/avx2.rs

Large diffs are not rendered by default.

30 changes: 15 additions & 15 deletions fearless_simd/src/generated/simd_trait.rs

Large diffs are not rendered by default.

500 changes: 296 additions & 204 deletions fearless_simd/src/generated/sse4_2.rs

Large diffs are not rendered by default.

52 changes: 28 additions & 24 deletions fearless_simd/src/generated/wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5350,12 +5350,13 @@ impl Simd for WasmSimd128 {
let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower);
let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper);
let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper);
unsafe {
v128_store(dest[0 * 4usize..].as_mut_ptr() as *mut v128, out0);
v128_store(dest[1 * 4usize..].as_mut_ptr() as *mut v128, out1);
v128_store(dest[2 * 4usize..].as_mut_ptr() as *mut v128, out2);
v128_store(dest[3 * 4usize..].as_mut_ptr() as *mut v128, out3);
}
let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
unreachable!()
};
crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out0, &mut chunks[0]);
crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out1, &mut chunks[1]);
crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out2, &mut chunks[2]);
crate::transmute::checked_transmute_store::<v128, [f32; 4usize]>(out3, &mut chunks[3]);
}
#[inline(always)]
fn reinterpret_u8_f32x16(self, a: f32x16<Self>) -> u8x64<Self> {
Expand Down Expand Up @@ -5980,12 +5981,13 @@ impl Simd for WasmSimd128 {
let out3 = u8x16_shuffle::<8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31>(
v02_upper, v13_upper,
);
unsafe {
v128_store(dest[0 * 16usize..].as_mut_ptr() as *mut v128, out0);
v128_store(dest[1 * 16usize..].as_mut_ptr() as *mut v128, out1);
v128_store(dest[2 * 16usize..].as_mut_ptr() as *mut v128, out2);
v128_store(dest[3 * 16usize..].as_mut_ptr() as *mut v128, out3);
}
let (chunks, []) = dest.as_chunks_mut::<16usize>() else {
unreachable!()
};
crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out0, &mut chunks[0]);
crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out1, &mut chunks[1]);
crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out2, &mut chunks[2]);
crate::transmute::checked_transmute_store::<v128, [u8; 16usize]>(out3, &mut chunks[3]);
}
#[inline(always)]
fn reinterpret_u32_u8x64(self, a: u8x64<Self>) -> u32x16<Self> {
Expand Down Expand Up @@ -6676,12 +6678,13 @@ impl Simd for WasmSimd128 {
let out1 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_lower, v13_lower);
let out2 = u16x8_shuffle::<0, 8, 1, 9, 2, 10, 3, 11>(v02_upper, v13_upper);
let out3 = u16x8_shuffle::<4, 12, 5, 13, 6, 14, 7, 15>(v02_upper, v13_upper);
unsafe {
v128_store(dest[0 * 8usize..].as_mut_ptr() as *mut v128, out0);
v128_store(dest[1 * 8usize..].as_mut_ptr() as *mut v128, out1);
v128_store(dest[2 * 8usize..].as_mut_ptr() as *mut v128, out2);
v128_store(dest[3 * 8usize..].as_mut_ptr() as *mut v128, out3);
}
let (chunks, []) = dest.as_chunks_mut::<8usize>() else {
unreachable!()
};
crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out0, &mut chunks[0]);
crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out1, &mut chunks[1]);
crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out2, &mut chunks[2]);
crate::transmute::checked_transmute_store::<v128, [u16; 8usize]>(out3, &mut chunks[3]);
}
#[inline(always)]
fn narrow_u16x32(self, a: u16x32<Self>) -> u8x32<Self> {
Expand Down Expand Up @@ -7378,12 +7381,13 @@ impl Simd for WasmSimd128 {
let out1 = u32x4_shuffle::<2, 6, 3, 7>(v02_lower, v13_lower);
let out2 = u32x4_shuffle::<0, 4, 1, 5>(v02_upper, v13_upper);
let out3 = u32x4_shuffle::<2, 6, 3, 7>(v02_upper, v13_upper);
unsafe {
v128_store(dest[0 * 4usize..].as_mut_ptr() as *mut v128, out0);
v128_store(dest[1 * 4usize..].as_mut_ptr() as *mut v128, out1);
v128_store(dest[2 * 4usize..].as_mut_ptr() as *mut v128, out2);
v128_store(dest[3 * 4usize..].as_mut_ptr() as *mut v128, out3);
}
let (chunks, []) = dest.as_chunks_mut::<4usize>() else {
unreachable!()
};
crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out0, &mut chunks[0]);
crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out1, &mut chunks[1]);
crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out2, &mut chunks[2]);
crate::transmute::checked_transmute_store::<v128, [u32; 4usize]>(out3, &mut chunks[3]);
}
#[inline(always)]
fn reinterpret_u8_u32x16(self, a: u32x16<Self>) -> u8x64<Self> {
Expand Down
15 changes: 9 additions & 6 deletions fearless_simd_gen/src/mk_wasm.rs
Original file line number Diff line number Diff line change
Expand Up @@ -683,6 +683,7 @@ impl Level for WasmSimd128 {
} => {
assert_eq!(block_count, 4, "only count of 4 is currently supported");
let elems_per_vec = block_size as usize / vec_ty.scalar_bits;
let scalar_ty = vec_ty.scalar.rust(vec_ty.scalar_bits);

let (lower_indices, upper_indices, shuffle_fn) = match vec_ty.scalar_bits {
8 => (
Expand Down Expand Up @@ -741,12 +742,14 @@ impl Level for WasmSimd128 {
let out2 = #shuffle_fn::<#lower_indices>(v02_upper, v13_upper);
let out3 = #shuffle_fn::<#upper_indices>(v02_upper, v13_upper);

unsafe {
v128_store(dest[0 * #elems_per_vec..].as_mut_ptr() as *mut v128, out0);
v128_store(dest[1 * #elems_per_vec..].as_mut_ptr() as *mut v128, out1);
v128_store(dest[2 * #elems_per_vec..].as_mut_ptr() as *mut v128, out2);
v128_store(dest[3 * #elems_per_vec..].as_mut_ptr() as *mut v128, out3);
}
let (chunks, []) = dest.as_chunks_mut::<#elems_per_vec>() else {
unreachable!()
};

crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out0, &mut chunks[0]);
crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out1, &mut chunks[1]);
crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out2, &mut chunks[2]);
crate::transmute::checked_transmute_store::<v128, [#scalar_ty; #elems_per_vec]>(out3, &mut chunks[3]);
}
}
}
Expand Down
104 changes: 48 additions & 56 deletions fearless_simd_gen/src/mk_x86.rs
Original file line number Diff line number Diff line change
Expand Up @@ -219,11 +219,11 @@ impl Level for X86 {
OpSig::LoadInterleaved {
block_size,
block_count,
} => self.handle_load_interleaved(method_sig, vec_ty, block_size, block_count),
} => self.handle_load_interleaved(op, vec_ty, block_size, block_count),
OpSig::StoreInterleaved {
block_size,
block_count,
} => self.handle_store_interleaved(method_sig, vec_ty, block_size, block_count),
} => self.handle_store_interleaved(op, vec_ty, block_size, block_count),
OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
OpSig::AsArray { kind } => {
generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
Expand Down Expand Up @@ -1779,7 +1779,7 @@ impl X86 {

pub(crate) fn handle_load_interleaved(
&self,
method_sig: TokenStream,
op: Op,
vec_ty: &VecType,
block_size: u16,
block_count: u16,
Expand All @@ -1789,7 +1789,7 @@ impl X86 {
"only 128-bit blocks are currently supported"
);
assert_eq!(block_count, 4, "only count of 4 is currently supported");
let expr = match vec_ty.scalar_bits {
match vec_ty.scalar_bits {
32 | 16 | 8 => {
let block_ty =
VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
Expand Down Expand Up @@ -1874,24 +1874,24 @@ impl X86 {
}
};

quote! {
let (chunks, []) = src.as_chunks::<#block_len>() else {
unreachable!()
};
let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[0],
);
let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[1],
);
let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[2],
);
let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[3],
);
self.kernel_method(op, vec_ty, |token| {
quote! {
let (chunks, []) = src.as_chunks::<#block_len>() else {
unreachable!()
};
let v0: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[0],
);
let v1: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[1],
);
let v2: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[2],
);
let v3: #native_ty = crate::transmute::checked_transmute_copy::<[#scalar_ty; #block_len], #native_ty>(
&chunks[3],
);

unsafe {
#init_shuffle

let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
Expand All @@ -1901,26 +1901,20 @@ impl X86 {

#final_unpack

self.#combine_full(
self.#combine_half(out0.simd_into(self), out1.simd_into(self)),
self.#combine_half(out2.simd_into(self), out3.simd_into(self)),
#token.#combine_full(
#token.#combine_half(out0.simd_into(#token), out1.simd_into(#token)),
#token.#combine_half(out2.simd_into(#token), out3.simd_into(#token)),
)
}
}
})
}
_ => unimplemented!(),
};

quote! {
#method_sig {
#expr
}
}
}

pub(crate) fn handle_store_interleaved(
&self,
method_sig: TokenStream,
op: Op,
vec_ty: &VecType,
block_size: u16,
block_count: u16,
Expand All @@ -1930,12 +1924,12 @@ impl X86 {
"only 128-bit blocks are currently supported"
);
assert_eq!(block_count, 4, "only count of 4 is currently supported");
let expr = match vec_ty.scalar_bits {
match vec_ty.scalar_bits {
32 | 16 | 8 => {
let block_ty =
VecType::new(vec_ty.scalar, vec_ty.scalar_bits, 128 / vec_ty.scalar_bits);
let store_unaligned =
intrinsic_ident("storeu", coarse_type(&block_ty), block_ty.n_bits());
let scalar_ty = block_ty.scalar.rust(block_ty.scalar_bits);
let native_ty = self.arch_ty(&block_ty);
let vec_32 = block_ty.reinterpret(block_ty.scalar, 32);
let unpacklo_32 = simple_sign_unaware_intrinsic("unpacklo", &vec_32);
let unpackhi_32 = simple_sign_unaware_intrinsic("unpackhi", &vec_32);
Expand Down Expand Up @@ -2013,16 +2007,16 @@ impl X86 {
}
};

quote! {
let (v01, v23) = self.#split_full(a);
let (v0, v1) = self.#split_half(v01);
let (v2, v3) = self.#split_half(v23);
let v0 = v0.into();
let v1 = v1.into();
let v2 = v2.into();
let v3 = v3.into();

unsafe {
self.kernel_method(op, vec_ty, |token| {
quote! {
let (v01, v23) = #token.#split_full(a);
let (v0, v1) = #token.#split_half(v01);
let (v2, v3) = #token.#split_half(v23);
let v0 = v0.into();
let v1 = v1.into();
let v2 = v2.into();
let v3 = v3.into();

let tmp0 = #unpacklo_32(v0, v1); // [0,4,1,5]
let tmp1 = #unpackhi_32(v0, v1); // [2,6,3,7]
let tmp2 = #unpacklo_32(v2, v3); // [8,12,9,13]
Expand All @@ -2032,20 +2026,18 @@ impl X86 {

#post_shuffle

#store_unaligned(dest.as_mut_ptr() as *mut _, out0);
#store_unaligned(dest.as_mut_ptr().add(#block_len) as *mut _, out1);
#store_unaligned(dest.as_mut_ptr().add(2 * #block_len) as *mut _, out2);
#store_unaligned(dest.as_mut_ptr().add(3 * #block_len) as *mut _, out3);
let (chunks, []) = dest.as_chunks_mut::<#block_len>() else {
unreachable!()
};

crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out0, &mut chunks[0]);
crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out1, &mut chunks[1]);
crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out2, &mut chunks[2]);
crate::transmute::checked_transmute_store::<#native_ty, [#scalar_ty; #block_len]>(out3, &mut chunks[3]);
}
}
})
}
_ => unimplemented!(),
};

quote! {
#method_sig {
#expr
}
}
}

Expand Down
26 changes: 22 additions & 4 deletions fearless_simd_gen/src/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,12 @@ impl Op {
let arg_decls = sig.arg_decls();
let call_args = &sig.arg_names;
let ret = &sig.ret;
let kernel_call = if matches!(self.sig, OpSig::StoreInterleaved { .. } | OpSig::StoreArray)
{
quote! { kernel(self #(, #call_args)*); }
} else {
quote! { kernel(self #(, #call_args)*) }
};

quote! {
#method_sig {
Expand All @@ -237,7 +243,7 @@ impl Op {
}
);

kernel(self #(, #call_args)*)
#kernel_call
}
}
}
Expand Down Expand Up @@ -641,7 +647,7 @@ const FLOAT_OPS: &[Op] = &[
"Compute an approximate reciprocal (`1. / x`) for each element.\n\n\
This uses a fast hardware estimate where available, and falls back to exact division otherwise.\n\n\
On x86 for `f32`, this has a relative error less than `1.5 × 2^-12`. \
On AArch64 (`f32` and `f64`), this has a relative error less than `2^-8`. \
On `AArch64` (`f32` and `f64`), this has a relative error less than `2^-8`. \
The precision of this operation may change as new platform support is added.",
),
Op::new(
Expand Down Expand Up @@ -1347,7 +1353,13 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec<Op> {
block_size: 128,
block_count: 4,
},
"Load elements from an array with 4-way interleaving.\n\nReads consecutive elements and deinterleaves them into a single vector.",
"Load elements from an array with 4-way interleaving.\n\n\
This is different from loading a vector and calling `interleave`: `interleave` combines two already-loaded \
vectors, while this operation treats memory as four consecutive 128-bit blocks and transposes those blocks \
into one vector.\n\n\
For example, with 32-bit lanes, memory laid out as \
`[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]` loads as \
`[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]`.",
));
}

Expand All @@ -1359,7 +1371,13 @@ pub(crate) fn ops_for_type(ty: &VecType) -> Vec<Op> {
block_size: 128,
block_count: 4,
},
"Store elements to an array with 4-way interleaving.\n\nInterleaves the vector elements and writes them consecutively to memory.",
"Store elements to an array with 4-way interleaving.\n\n\
This is the inverse of `load_interleaved_128`. It is different from calling `interleave` and then storing: \
`interleave` combines two already-loaded vectors, while this operation transposes one vector into four \
consecutive 128-bit blocks in memory.\n\n\
For example, with 32-bit lanes, a vector containing \
`[a0, b0, c0, d0, a1, b1, c1, d1, a2, b2, c2, d2, a3, b3, c3, d3]` stores as \
`[a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, c2, c3, d0, d1, d2, d3]`.",
));
}

Expand Down
44 changes: 44 additions & 0 deletions fearless_simd_tests/tests/harness/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -816,6 +816,50 @@ fn all_false_mask8x16<S: Simd>(simd: S) {
assert!(!simd.all_false_mask8x16(one_neg));
}

#[simd_test]
fn load_interleaved_128_f32x16<S: Simd>(simd: S) {
    // Source memory, written as four rows of four lanes each (one row per
    // consecutive 128-bit block). Special values (NaN, signed zero,
    // infinities, MIN/MAX) are mixed in among ordinary numbers.
    #[rustfmt::skip]
    let data = [
        0.0,           4.0,               8.0,   f32::MIN,
        f32::NAN,      -0.0,              9.0,   13.0,
        f32::INFINITY, 6.0,               -10.0, f32::MAX,
        -3.0,          f32::NEG_INFINITY, 11.0,  15.0,
    ];

    // The 4x4 transpose of `data`: row `i` here is column `i` of the input.
    #[rustfmt::skip]
    let expected = [
        0.0,      f32::NAN, f32::INFINITY, -3.0,
        4.0,      -0.0,     6.0,           f32::NEG_INFINITY,
        8.0,      9.0,      -10.0,         11.0,
        f32::MIN, 13.0,     f32::MAX,      15.0,
    ];

    let loaded = simd.load_interleaved_128_f32x16(&data);

    // NaN never compares equal to itself, so compare raw bit patterns
    // instead of the float values.
    let actual_bits = (*loaded).map(f32::to_bits);
    let expected_bits = expected.map(f32::to_bits);
    assert_eq!(actual_bits, expected_bits);
}

#[simd_test]
fn load_interleaved_128_u32x16<S: Simd>(simd: S) {
#[rustfmt::skip]
Expand Down
Loading