From badc08e56eaebeb73c8de2f7a5c2a65ff92e29bd Mon Sep 17 00:00:00 2001
From: WANG Rui
Date: Sat, 9 May 2026 11:17:36 +0800
Subject: [PATCH] LoongArch SIMD: use immediate compare and simplify accumulator setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

```
simd/lines_fwd/1
    time:   [2.8043 ns 2.8059 ns 2.8080 ns]
    thrpt:  [339.63 MiB/s 339.88 MiB/s 340.08 MiB/s]
    change:
        time:   [−0.0748% −0.0056% +0.0611%] (p = 0.88 > 0.05)
        thrpt:  [−0.0610% +0.0056% +0.0749%]
    No change in performance detected.
simd/lines_fwd/8
    time:   [7.0106 ns 7.0151 ns 7.0210 ns]
    thrpt:  [1.0612 GiB/s 1.0621 GiB/s 1.0628 GiB/s]
    change:
        time:   [−20.493% −20.385% −20.246%] (p = 0.00 < 0.05)
        thrpt:  [+25.385% +25.605% +25.775%]
    Performance has improved.
simd/lines_fwd/128
    time:   [34.654 ns 34.677 ns 34.705 ns]
    thrpt:  [3.4349 GiB/s 3.4377 GiB/s 3.4399 GiB/s]
    change:
        time:   [−3.9461% −3.7069% −3.3401%] (p = 0.00 < 0.05)
        thrpt:  [+3.4556% +3.8496% +4.1083%]
    Performance has improved.
simd/lines_fwd/1024
    time:   [53.767 ns 53.848 ns 53.933 ns]
    thrpt:  [17.683 GiB/s 17.711 GiB/s 17.737 GiB/s]
    change:
        time:   [+0.6588% +0.8136% +0.9597%] (p = 0.00 < 0.05)
        thrpt:  [−0.9506% −0.8070% −0.6545%]
    Change within noise threshold.
simd/lines_fwd/131072
    time:   [3.0838 µs 3.0854 µs 3.0876 µs]
    thrpt:  [39.536 GiB/s 39.564 GiB/s 39.584 GiB/s]
    change:
        time:   [+0.0684% +0.1379% +0.2125%] (p = 0.00 < 0.05)
        thrpt:  [−0.2120% −0.1377% −0.0683%]
    Change within noise threshold.
simd/lines_fwd/134217728
    time:   [4.3453 ms 4.3485 ms 4.3521 ms]
    thrpt:  [28.721 GiB/s 28.746 GiB/s 28.766 GiB/s]
    change:
        time:   [−19.888% −19.804% −19.711%] (p = 0.00 < 0.05)
        thrpt:  [+24.550% +24.695% +24.826%]
    Performance has improved.
```
---
 crates/edit/src/simd/lines_bwd.rs | 18 ++++++++----------
 crates/edit/src/simd/lines_fwd.rs | 18 ++++++++----------
 2 files changed, 16 insertions(+), 20 deletions(-)

diff --git a/crates/edit/src/simd/lines_bwd.rs b/crates/edit/src/simd/lines_bwd.rs
index 581e69954b7..37a016906f9 100644
--- a/crates/edit/src/simd/lines_bwd.rs
+++ b/crates/edit/src/simd/lines_bwd.rs
@@ -206,7 +206,7 @@ unsafe fn lines_bwd_lasx(
             }
         }
 
-        let lf = lasx_xvrepli_b(b'\n' as i32);
+        const LF: i32 = b'\n' as i32;
         let line_stop = line_stop.min(line);
         let off = end.addr() & 31;
         if off != 0 && off < end.offset_from_unsigned(beg) {
@@ -221,11 +221,10 @@ unsafe fn lines_bwd_lasx(
                 let v3 = lasx_xvld::<64>(chunk_start as *const _);
                 let v4 = lasx_xvld::<96>(chunk_start as *const _);
 
-                let mut sum = lasx_xvrepli_b(0);
-                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v1, lf));
-                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v2, lf));
-                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v3, lf));
-                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v4, lf));
+                let mut sum = lasx_xvneg_b(lasx_xvseqi_b::<LF>(v1));
+                sum = lasx_xvsub_b(sum, lasx_xvseqi_b::<LF>(v2));
+                sum = lasx_xvsub_b(sum, lasx_xvseqi_b::<LF>(v3));
+                sum = lasx_xvsub_b(sum, lasx_xvseqi_b::<LF>(v4));
                 let sum = horizontal_sum(sum);
 
                 let line_next = line - sum as CoordType;
@@ -240,9 +239,9 @@ unsafe fn lines_bwd_lasx(
         while end.offset_from_unsigned(beg) >= 32 {
             let chunk_start = end.sub(32);
             let v = lasx_xvld::<0>(chunk_start as *const _);
-            let c = lasx_xvseq_b(v, lf);
+            let c = lasx_xvseqi_b::<LF>(v);
 
-            let ones = lasx_xvand_v(c, lasx_xvrepli_b(1));
+            let ones = lasx_xvandi_b::<1>(c);
             let sum = horizontal_sum(ones);
 
             let line_next = line - sum as CoordType;
@@ -295,8 +294,7 @@ unsafe fn lines_bwd_lsx(
                 let v3 = lsx_vld::<32>(chunk_start as *const _);
                 let v4 = lsx_vld::<48>(chunk_start as *const _);
 
-                let mut sum = lsx_vldi::<0>();
-                sum = lsx_vsub_b(sum, lsx_vseqi_b::<LF>(v1));
+                let mut sum = lsx_vneg_b(lsx_vseqi_b::<LF>(v1));
                 sum = lsx_vsub_b(sum, lsx_vseqi_b::<LF>(v2));
                 sum = lsx_vsub_b(sum, lsx_vseqi_b::<LF>(v3));
                 sum = lsx_vsub_b(sum, lsx_vseqi_b::<LF>(v4));
diff --git a/crates/edit/src/simd/lines_fwd.rs b/crates/edit/src/simd/lines_fwd.rs
index 86f333992a7..c05d827d8d3 100644
--- a/crates/edit/src/simd/lines_fwd.rs
+++ b/crates/edit/src/simd/lines_fwd.rs
@@ -213,7 +213,7 @@ unsafe fn lines_fwd_lasx(
             }
         }
 
-        let lf = lasx_xvrepli_b(b'\n' as i32);
+        const LF: i32 = b'\n' as i32;
         let off = beg.align_offset(32);
         if off != 0 && off < end.offset_from_unsigned(beg) {
             (beg, line) = lines_fwd_fallback(beg, beg.add(off), line, line_stop);
@@ -226,11 +226,10 @@ unsafe fn lines_fwd_lasx(
                 let v3 = lasx_xvld::<64>(beg as *const _);
                 let v4 = lasx_xvld::<96>(beg as *const _);
 
-                let mut sum = lasx_xvrepli_b(0);
-                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v1, lf));
-                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v2, lf));
-                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v3, lf));
-                sum = lasx_xvsub_b(sum, lasx_xvseq_b(v4, lf));
+                let mut sum = lasx_xvneg_b(lasx_xvseqi_b::<LF>(v1));
+                sum = lasx_xvsub_b(sum, lasx_xvseqi_b::<LF>(v2));
+                sum = lasx_xvsub_b(sum, lasx_xvseqi_b::<LF>(v3));
+                sum = lasx_xvsub_b(sum, lasx_xvseqi_b::<LF>(v4));
                 let sum = horizontal_sum(sum);
 
                 let line_next = line + sum as CoordType;
@@ -244,9 +243,9 @@ unsafe fn lines_fwd_lasx(
 
         while end.offset_from_unsigned(beg) >= 32 {
             let v = lasx_xvld::<0>(beg as *const _);
-            let c = lasx_xvseq_b(v, lf);
+            let c = lasx_xvseqi_b::<LF>(v);
 
-            let ones = lasx_xvand_v(c, lasx_xvrepli_b(1));
+            let ones = lasx_xvandi_b::<1>(c);
             let sum = horizontal_sum(ones);
 
             let line_next = line + sum as CoordType;
@@ -298,8 +297,7 @@ unsafe fn lines_fwd_lsx(
                 let v3 = lsx_vld::<32>(beg as *const _);
                 let v4 = lsx_vld::<48>(beg as *const _);
 
-                let mut sum = lsx_vldi(0);
-                sum = lsx_vsub_b(sum, lsx_vseqi_b::<LF>(v1));
+                let mut sum = lsx_vneg_b(lsx_vseqi_b::<LF>(v1));
                 sum = lsx_vsub_b(sum, lsx_vseqi_b::<LF>(v2));
                 sum = lsx_vsub_b(sum, lsx_vseqi_b::<LF>(v3));
                 sum = lsx_vsub_b(sum, lsx_vseqi_b::<LF>(v4));