
Commit 2682a5c

Ian committed

Version upgrade to 1.0.2

- Optimized randomized SVD implementation, about 2x speedup
- Upgraded single-utilities
1 parent a16e406 commit 2682a5c

5 files changed, with 220 additions and 137 deletions


Cargo.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default.

Cargo.toml

Lines changed: 2 additions & 2 deletions
@@ -4,7 +4,7 @@ description = "A Rust port of LAS2 from SVDLIBC"
 keywords = ["svd"]
 categories = ["algorithms", "data-structures", "mathematics", "science"]
 name = "single-svdlib"
-version = "1.0.1"
+version = "1.0.2"
 edition = "2021"
 license-file = "SVDLIBC-LICENSE.txt"

@@ -18,4 +18,4 @@ rayon = "1.10.0"
 thiserror = "2.0.9"
 nshare = {version = "0.10.0", features = ["nalgebra", "ndarray"] }
 ndarray = "0.16.1"
-single-utilities = "0.4.0"
+single-utilities = "0.7.0"

src/lanczos/masked.rs

Lines changed: 75 additions & 81 deletions
@@ -1,9 +1,10 @@
+use rayon::iter::IndexedParallelIterator;
 use crate::utils::determine_chunk_size;
 use crate::{SMat, SvdFloat};
 use nalgebra_sparse::CsrMatrix;
 use num_traits::Float;
 use rayon::iter::ParallelIterator;
-use rayon::prelude::{IntoParallelIterator, ParallelBridge};
+use rayon::prelude::{IntoParallelIterator, ParallelBridge, ParallelSliceMut};
 use std::ops::AddAssign;
 
 pub struct MaskedCSRMatrix<'a, T: Float> {
@@ -86,7 +87,6 @@ impl<'a, T: Float + AddAssign + Sync + Send> SMat<T> for MaskedCSRMatrix<'a, T>
     }
 
     fn svd_opa(&self, x: &[T], y: &mut [T], transposed: bool) {
-        // TODO parallelize me please
         let nrows = if transposed {
             self.ncols()
         } else {
@@ -117,93 +117,87 @@ impl<'a, T: Float + AddAssign + Sync + Send> SMat<T> for MaskedCSRMatrix<'a, T>
 
         y.fill(T::zero());
 
-        let high_precision_mode = self.ensure_identical_results_mode();
-
         if !transposed {
-            if high_precision_mode && self.uses_all_columns() {
-                // For small matrices using all columns, mimic the exact behavior of
-                // the original implementation to ensure identical results
-                for i in 0..self.matrix.nrows() {
-                    let mut sum = T::zero();
-                    for j in major_offsets[i]..major_offsets[i + 1] {
-                        let col = minor_indices[j];
-                        // For all-columns mode, we know all columns are included
-                        let masked_col = self.original_to_masked[col].unwrap();
-                        sum = sum + (values[j] * x[masked_col]);
-                    }
-                    y[i] = sum;
-                }
-            } else {
-                let chunk_size = determine_chunk_size(self.matrix.nrows());
-                y.chunks_mut(chunk_size).enumerate().par_bridge().for_each(
-                    |(chunk_idx, y_chunk)| {
-                        let start_row = chunk_idx * chunk_size;
-                        let end_row = (start_row + y_chunk.len()).min(self.matrix.nrows());
-
-                        for i in start_row..end_row {
-                            let row_idx = i - start_row;
-                            let mut sum = T::zero();
-
-                            for j in major_offsets[i]..major_offsets[i + 1] {
-                                let col = minor_indices[j];
-                                if let Some(masked_col) = self.original_to_masked[col] {
-                                    sum += values[j] * x[masked_col];
-                                };
-                            }
-                            y_chunk[row_idx] = sum;
-                        }
-                    },
-                );
+            // A * x calculation
+            let row_count = self.matrix.nrows();
+            let (major_offsets, minor_indices, values) = self.matrix.csr_data();
+
+            let chunk_size = std::cmp::max(16, row_count / (rayon::current_num_threads() * 2));
+
+            let mut valid_indices = Vec::with_capacity(self.matrix.ncols());
+            for col in 0..self.matrix.ncols() {
+                valid_indices.push(self.original_to_masked[col]);
             }
-        } else {
-            // For the transposed case (A^T * x)
-            if high_precision_mode && self.uses_all_columns() {
-                // Clear the output vector first
-                for yval in y.iter_mut() {
-                    *yval = T::zero();
-                }
-
-                // Follow exact same order of operations as original implementation
-                for i in 0..self.matrix.nrows() {
-                    let row_val = x[i];
-                    for j in major_offsets[i]..major_offsets[i + 1] {
-                        let col = minor_indices[j];
-                        let masked_col = self.original_to_masked[col].unwrap();
-                        y[masked_col] = y[masked_col] + (values[j] * row_val);
-                    }
-                }
-            } else {
-                let nrows = self.matrix.nrows();
-                let chunk_size = determine_chunk_size(nrows);
-                let num_chunks = (nrows + chunk_size - 1) / chunk_size;
-                let results: Vec<Vec<T>> = (0..chunk_size)
-                    .into_par_iter()
-                    .map(|chunk_idx| {
-                        let start = chunk_idx * chunk_size;
-                        let end = (start + chunk_size).min(nrows);
-
-                        let mut local_y = vec![T::zero(); y.len()];
-                        for i in start..end {
-                            let row_val = x[i];
-                            for j in major_offsets[i]..major_offsets[i + 1] {
-                                let col = minor_indices[j];
-                                if let Some(masked_col) = self.original_to_masked[col] {
-                                    local_y[masked_col] += values[j] * row_val;
+
+            y.par_chunks_mut(chunk_size)
+                .enumerate()
+                .for_each(|(chunk_idx, y_chunk)| {
+                    let start_row = chunk_idx * chunk_size;
+                    let end_row = (start_row + y_chunk.len()).min(row_count);
+
+                    for i in start_row..end_row {
+                        let row_idx = i - start_row;
+                        let mut sum = T::zero();
+
+                        let row_start = major_offsets[i];
+                        let row_end = major_offsets[i + 1];
+
+                        let mut j = row_start;
+
+                        while j + 4 <= row_end {
+                            for offset in 0..4 {
+                                let idx = j + offset;
+                                let col = minor_indices[idx];
+                                if let Some(masked_col) = valid_indices[col] {
+                                    sum += values[idx] * x[masked_col];
                                 }
                             }
+                            j += 4;
+                        }
+
+                        while j < row_end {
+                            let col = minor_indices[j];
+                            if let Some(masked_col) = valid_indices[col] {
+                                sum += values[j] * x[masked_col];
+                            }
+                            j += 1;
                         }
-                        local_y
-                    })
-                    .collect();
-
-                y.fill(T::zero());
 
-                for local_y in results {
-                    for (idx, val) in local_y.iter().enumerate() {
-                        if !val.is_zero() {
-                            y[idx] += *val;
+                        y_chunk[row_idx] = sum;
+                    }
+                });
+        } else {
+            // A^T * x calculation
+            let nrows = self.matrix.nrows();
+            let chunk_size = determine_chunk_size(nrows);
+
+            // Process in parallel chunks
+            let results: Vec<Vec<T>> = (0..((nrows + chunk_size - 1) / chunk_size))
+                .into_par_iter()
+                .map(|chunk_idx| {
+                    let start = chunk_idx * chunk_size;
+                    let end = (start + chunk_size).min(nrows);
+
+                    let mut local_y = vec![T::zero(); y.len()];
+                    for i in start..end {
+                        let row_val = x[i];
+                        for j in major_offsets[i]..major_offsets[i + 1] {
+                            let col = minor_indices[j];
+                            if let Some(masked_col) = self.original_to_masked[col] {
+                                local_y[masked_col] += values[j] * row_val;
+                            }
                         }
                     }
+                    local_y
+                })
+                .collect();
+
+            // Combine results
+            for local_y in results {
+                for (idx, val) in local_y.iter().enumerate() {
+                    if !val.is_zero() {
+                        y[idx] += *val;
+                    }
                 }
             }
         }
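For orientation, the rewritten svd_opa uses two rayon patterns: the forward product y = A * x gives each task a disjoint chunk of output rows, so the chunks of y can be filled without synchronization, while the transposed product y = A^T * x scatters into arbitrary positions of y, so each chunk of rows accumulates into a private buffer that is summed sequentially afterwards. Below is a minimal standalone sketch of both patterns, assuming bare CSR slices of f64 rather than the crate's MaskedCSRMatrix; the names csr_ax and csr_atx are illustrative only and not part of the library.

use rayon::prelude::*;

// y = A * x: each rayon task owns a disjoint chunk of output rows,
// so the chunks of y can be written in parallel without locking.
fn csr_ax(offsets: &[usize], cols: &[usize], vals: &[f64], x: &[f64], y: &mut [f64]) {
    let nrows = offsets.len() - 1;
    // At least 16 rows per chunk, roughly two chunks per worker thread.
    let chunk = std::cmp::max(16, nrows / (rayon::current_num_threads() * 2));
    y.par_chunks_mut(chunk).enumerate().for_each(|(c, y_chunk)| {
        let start = c * chunk;
        for (k, out) in y_chunk.iter_mut().enumerate() {
            let i = start + k;
            let mut sum = 0.0;
            for j in offsets[i]..offsets[i + 1] {
                sum += vals[j] * x[cols[j]];
            }
            *out = sum;
        }
    });
}

// y = A^T * x: rows scatter into arbitrary positions of y, so each chunk of
// rows accumulates into its own local buffer; the buffers are combined
// sequentially at the end to avoid concurrent writes.
fn csr_atx(offsets: &[usize], cols: &[usize], vals: &[f64], x: &[f64], y: &mut [f64]) {
    let nrows = offsets.len() - 1;
    let chunk = std::cmp::max(16, nrows / (rayon::current_num_threads() * 2));
    let num_chunks = (nrows + chunk - 1) / chunk;
    let locals: Vec<Vec<f64>> = (0..num_chunks)
        .into_par_iter()
        .map(|c| {
            let (start, end) = (c * chunk, ((c + 1) * chunk).min(nrows));
            let mut local = vec![0.0; y.len()];
            for i in start..end {
                for j in offsets[i]..offsets[i + 1] {
                    local[cols[j]] += vals[j] * x[i];
                }
            }
            local
        })
        .collect();
    y.fill(0.0);
    for local in locals {
        for (idx, v) in local.iter().enumerate() {
            y[idx] += *v;
        }
    }
}

fn main() {
    // 2x3 matrix [[1, 0, 2], [0, 3, 0]] in CSR form.
    let (offsets, cols, vals) = ([0usize, 2, 3], [0usize, 2, 1], [1.0, 2.0, 3.0]);
    let mut y2 = [0.0; 2];
    csr_ax(&offsets, &cols, &vals, &[1.0, 1.0, 1.0], &mut y2);
    assert_eq!(y2, [3.0, 3.0]);

    let mut y3 = [0.0; 3];
    csr_atx(&offsets, &cols, &vals, &[1.0, 1.0], &mut y3);
    assert_eq!(y3, [1.0, 3.0, 2.0]);
}

The chunk-size heuristic in the forward case mirrors the commit (at least 16 rows per chunk, roughly two chunks per worker thread); the crate's own determine_chunk_size helper plays the same role in the transposed case.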

src/lib.rs

Lines changed: 4 additions & 1 deletion
@@ -178,6 +178,7 @@ mod simple_comparison_tests {
            randomized::PowerIterationNormalizer::QR,
            false,
            Some(42),
+           false
        );
 
        // Verify the computation succeeds on a highly sparse matrix
@@ -231,7 +232,8 @@
                7, // power iterations
                randomized::PowerIterationNormalizer::QR, // use QR normalization
                false,
-                Some(42), // random seed
+                Some(42),
+                false// random seed
            )
        });

@@ -264,6 +266,7 @@
                randomized::PowerIterationNormalizer::QR, // use QR normalization
                false,
                Some(42), // random seed
+                false
            )
        });
