Skip to content

Commit 950c8ad

Browse files
author
Ian
committed
added new loading option
1 parent 4d8a90e commit 950c8ad

6 files changed

Lines changed: 451 additions & 62 deletions

File tree

src/chunked_loader.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@ use anndata::{
66
use nalgebra_sparse::{pattern::SparsityPattern, CsrMatrix};
77
use ndarray::Ix1;
88

9-
use crate::{converter::LoadingConfig, utils::{read_array_as_usize_optimized, read_array_slice_as_usize}};
9+
use crate::{utils::{read_array_as_usize_optimized, read_array_slice_as_usize}, LoadingConfig};
1010

1111
pub fn load_csr_chunked<B: Backend>(
1212
container: &DataContainer<B>,

src/converter.rs

Lines changed: 0 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -13,25 +13,6 @@ use crate::{
1313
IMAnnData, IMArrayElement, IMElementCollection,
1414
};
1515

16-
/// Options that steer how data is loaded.
///
/// The `_mb` suffix on the size fields indicates megabytes; how each value is
/// consumed is decided by the loaders that receive this struct.
#[derive(Clone, Debug)]
pub struct LoadingConfig {
    /// Request chunked loading.
    pub use_chunked_loading: bool,
    /// Chunk size setting (megabytes, per the field name).
    pub chunk_size_mb: usize,
    /// Memory threshold setting (megabytes, per the field name).
    pub memory_threshold_mb: usize,
    /// Progress-reporting switch.
    pub show_progress: bool,
}

impl Default for LoadingConfig {
    /// Defaults: chunked loading off, `chunk_size_mb = 100`,
    /// `memory_threshold_mb = 1024`, progress reporting on.
    fn default() -> Self {
        LoadingConfig {
            show_progress: true,
            memory_threshold_mb: 1024,
            chunk_size_mb: 100,
            use_chunked_loading: false,
        }
    }
}
34-
3516
pub fn convert_to_in_memory<B: Backend>(anndata: AnnData<B>) -> anyhow::Result<IMAnnData> {
3617
let obs_df = anndata.read_obs()?;
3718
let obs_names = anndata.obs_names();

src/lib.rs

Lines changed: 30 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ mod converter;
44
pub(crate) mod utils;
55
pub(crate) mod chunked_loader;
66
pub(crate) mod optimized_loader;
7+
mod loader;
78

89
pub use ad::IMAnnData;
910
pub use ad::helpers::IMArrayElement;
@@ -14,4 +15,32 @@ pub use ad::helpers::IMAxisArrays;
1415
pub use converter::convert_to_in_memory;
1516
pub use converter::convert_to_backed;
1617
pub use converter::convert_to_new_backed_h5;
17-
pub use base::DeepClone;
18+
pub use base::DeepClone;
19+
20+
/// Selects how matrices are read from disk.
///
/// The decision is made by `utils::should_use_chunked_loading`, which is not
/// part of this file; the variant notes below follow the variant names and the
/// presets that use them, and should be confirmed against that helper.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum LoadingStrategy {
    /// Leave the choice to the loader (the default).
    Auto,
    /// Force a complete, non-chunked read (preset used by `load_h5ad_fast`).
    ForceComplete,
    /// Force a chunked read (preset used by `load_h5ad_conservative`).
    ForceChunked,
}

impl Default for LoadingStrategy {
    /// Returns [`LoadingStrategy::Auto`], matching [`LoadingConfig::default`].
    fn default() -> Self {
        LoadingStrategy::Auto
    }
}

/// Options that steer how an `.h5ad` file is loaded.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct LoadingConfig {
    /// Which loading strategy to apply.
    pub loading_strategy: LoadingStrategy,
    /// Chunk size setting (megabytes, per the field name).
    pub chunk_size_mb: usize,
    /// Memory threshold setting (megabytes, per the field name).
    pub memory_threshold_mb: usize,
    /// When `true`, the loaders print progress lines to stdout.
    pub show_progress: bool,
}

impl Default for LoadingConfig {
    /// Defaults: [`LoadingStrategy::Auto`], `chunk_size_mb = 100`,
    /// `memory_threshold_mb = 1024`, progress reporting on.
    fn default() -> Self {
        Self {
            loading_strategy: LoadingStrategy::Auto,
            chunk_size_mb: 100,
            memory_threshold_mb: 1024,
            show_progress: true,
        }
    }
}
45+
46+
pub use loader::{load_h5ad, load_h5ad_fast, load_h5ad_conservative, load_h5ad_with_config};

src/loader.rs

Lines changed: 275 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,275 @@
1+
use std::path::Path;
2+
3+
use anndata::{backend::DataContainer, data::DataFrameIndex, ArrayData, Backend, Data, Readable};
4+
use anndata_hdf5::{H5File, H5};
5+
use polars::frame::DataFrame;
6+
use anndata::backend::AttributeOp;
7+
use anndata::backend::GroupOp;
8+
9+
use crate::IMArrayElement;
10+
use crate::IMElement;
11+
use crate::LoadingStrategy;
12+
use crate::{chunked_loader::load_csr_chunked, optimized_loader::load_csr_optimized, utils::{read_dataframe_index, should_use_chunked_loading}, IMAnnData, LoadingConfig};
13+
14+
pub fn load_h5ad(h5_path: impl AsRef<Path>) -> anyhow::Result<IMAnnData> {
15+
load_h5ad_with_config(h5_path, LoadingConfig::default())
16+
}
17+
18+
pub fn load_h5ad_fast(h5_path: impl AsRef<Path>) -> anyhow::Result<IMAnnData> {
19+
let config = LoadingConfig {
20+
loading_strategy: LoadingStrategy::ForceComplete,
21+
chunk_size_mb: 256,
22+
memory_threshold_mb: 4096,
23+
show_progress: true,
24+
};
25+
26+
load_h5ad_with_config(h5_path, config)
27+
}
28+
29+
pub fn load_h5ad_conservative(h5_path: impl AsRef<Path>) -> anyhow::Result<IMAnnData> {
30+
let config = LoadingConfig {
31+
loading_strategy: LoadingStrategy::ForceChunked,
32+
chunk_size_mb: 64,
33+
memory_threshold_mb: 256,
34+
show_progress: true,
35+
};
36+
37+
load_h5ad_with_config(h5_path, config)
38+
}
39+
40+
pub fn load_h5ad_with_config(
41+
h5_path: impl AsRef<Path>,
42+
config: LoadingConfig,
43+
) -> anyhow::Result<IMAnnData> {
44+
let h5_file = H5::open(h5_path)?;
45+
46+
if config.show_progress {
47+
println!("Loading H5AD file directly...");
48+
}
49+
50+
// Load core components
51+
let (obs_df, obs_names) = load_obs(&h5_file, &config)?;
52+
let (var_df, var_names) = load_var(&h5_file, &config)?;
53+
let x_data = load_x_matrix(&h5_file, &config)?;
54+
55+
// Create the main structure
56+
let imad = IMAnnData::new_extended(x_data, obs_names, var_names, obs_df, var_df)?;
57+
58+
// Load optional components in parallel if beneficial
59+
load_axis_arrays(&h5_file, "obsm", imad.obsm(), &config)?;
60+
load_axis_arrays(&h5_file, "obsp", imad.obsp(), &config)?;
61+
load_axis_arrays(&h5_file, "varm", imad.varm(), &config)?;
62+
load_axis_arrays(&h5_file, "varp", imad.varp(), &config)?;
63+
load_axis_arrays(&h5_file, "layers", imad.layers(), &config)?;
64+
load_uns(&h5_file, imad.uns(), &config)?;
65+
66+
if config.show_progress {
67+
println!("H5AD file loaded successfully");
68+
}
69+
70+
Ok(imad)
71+
}
72+
73+
fn load_obs(
74+
h5_file: &H5File,
75+
config: &LoadingConfig
76+
) -> anyhow::Result<(DataFrame, Vec<String>)> {
77+
if config.show_progress {
78+
println!("Loading observations...");
79+
}
80+
81+
if !h5_file.exists("obs")? {
82+
return Ok((DataFrame::empty(), vec![]));
83+
}
84+
85+
let obs_container = DataContainer::open(h5_file, "obs")?;
86+
let obs_df: DataFrame = ArrayData::read(&obs_container)?.try_into()?;
87+
88+
let obs_index = read_dataframe_index(&obs_container)?;
89+
let obs_names = obs_index.into_vec();
90+
91+
if config.show_progress {
92+
println!(" {} observations loaded", obs_names.len());
93+
}
94+
95+
Ok((obs_df, obs_names))
96+
}
97+
98+
fn load_var(
99+
h5_file: &H5File,
100+
config: &LoadingConfig
101+
) -> anyhow::Result<(DataFrame, Vec<String>)> {
102+
if config.show_progress {
103+
println!("Loading variables...");
104+
}
105+
106+
if !h5_file.exists("var")? {
107+
return Ok((DataFrame::empty(), vec![]));
108+
}
109+
110+
let var_container = DataContainer::open(h5_file, "var")?;
111+
let var_df: DataFrame = ArrayData::read(&var_container)?.try_into()?;
112+
113+
let var_index = read_dataframe_index(&var_container)?;
114+
let var_names = var_index.into_vec();
115+
116+
if config.show_progress {
117+
println!(" {} variables loaded", var_names.len());
118+
}
119+
120+
Ok((var_df, var_names))
121+
}
122+
123+
fn load_x_matrix(
124+
h5_file: &anndata_hdf5::H5File,
125+
config: &LoadingConfig
126+
) -> anyhow::Result<ArrayData> {
127+
if !h5_file.link_exists("X") {
128+
if config.show_progress {
129+
println!("No X matrix found, using empty matrix");
130+
}
131+
return Ok(ArrayData::Array(
132+
anndata::data::DynArray::from(ndarray::Array2::<f64>::zeros((0, 0)))
133+
));
134+
}
135+
136+
if config.show_progress {
137+
println!("Loading X matrix...");
138+
}
139+
140+
let x_container = DataContainer::open(h5_file, "X")?;
141+
142+
let matrix_type = x_container.encoding_type()?;
143+
if config.show_progress {
144+
match &matrix_type {
145+
anndata::backend::DataType::CsrMatrix(_) => {
146+
let group = x_container.as_group()?;
147+
let shape: Vec<u64> = group.get_attr("shape")?;
148+
let nnz = group.open_dataset("data")?.shape()[0];
149+
println!(" CSR matrix: {}×{} with {} non-zeros", shape[0], shape[1], nnz);
150+
}
151+
anndata::backend::DataType::Array(_) => {
152+
let shape = x_container.as_dataset()?.shape();
153+
println!(" Dense matrix: {:?}", shape);
154+
}
155+
_ => {
156+
println!(" Matrix type: {:?}", matrix_type);
157+
}
158+
}
159+
}
160+
161+
let result = match matrix_type {
162+
anndata::backend::DataType::CsrMatrix(_) => {
163+
if should_use_chunked_loading(&x_container, config)? {
164+
if config.show_progress {
165+
println!(" Using chunked loading for large matrix");
166+
}
167+
load_csr_chunked(&x_container, config)?
168+
} else {
169+
if config.show_progress {
170+
println!(" Using optimized CSR loading");
171+
}
172+
load_csr_optimized(&x_container)?
173+
}
174+
}
175+
_ => {
176+
if config.show_progress {
177+
println!(" Using standard loading");
178+
}
179+
ArrayData::read(&x_container)?
180+
}
181+
};
182+
183+
if config.show_progress {
184+
println!(" X matrix loaded successfully");
185+
}
186+
187+
Ok(result)
188+
}
189+
190+
fn load_axis_arrays(
191+
h5_file: &anndata_hdf5::H5File,
192+
group_name: &str,
193+
target_arrays: crate::IMAxisArrays,
194+
config: &LoadingConfig,
195+
) -> anyhow::Result<()> {
196+
if !h5_file.link_exists(group_name) {
197+
return Ok(());
198+
}
199+
200+
let group = h5_file.open_group(group_name)?;
201+
let array_names = group.list()?;
202+
203+
if array_names.is_empty() {
204+
return Ok(());
205+
}
206+
207+
if config.show_progress {
208+
println!("Loading {} ({} items)...", group_name, array_names.len());
209+
}
210+
let arr_n_len = array_names.len();
211+
for array_name in array_names {
212+
if config.show_progress && arr_n_len > 3 {
213+
println!(" Loading {}/{}", group_name, array_name);
214+
}
215+
216+
let array_container = DataContainer::open(&group, &array_name)?;
217+
218+
let array_data = if group_name == "layers" {
219+
match array_container.encoding_type()? {
220+
anndata::backend::DataType::CsrMatrix(_) => {
221+
if should_use_chunked_loading(&array_container, config)? {
222+
if config.show_progress {
223+
println!(" Using chunked loading for layer {}", array_name);
224+
}
225+
load_csr_chunked(&array_container, config)?
226+
} else {
227+
load_csr_optimized(&array_container)?
228+
}
229+
}
230+
_ => ArrayData::read(&array_container)?,
231+
}
232+
} else {
233+
ArrayData::read(&array_container)?
234+
};
235+
236+
let im_array = IMArrayElement::new(array_data);
237+
target_arrays.add_array(array_name, im_array)?;
238+
}
239+
240+
if config.show_progress {
241+
println!(" {} loaded successfully", group_name);
242+
}
243+
244+
Ok(())
245+
}
246+
247+
fn load_uns(
248+
h5_file: &anndata_hdf5::H5File,
249+
target_uns: crate::IMElementCollection,
250+
config: &LoadingConfig,
251+
) -> anyhow::Result<()> {
252+
if !h5_file.link_exists("uns") {
253+
return Ok(());
254+
}
255+
256+
if config.show_progress {
257+
println!("Loading unstructured annotations...");
258+
}
259+
260+
let uns_group = h5_file.open_group("uns")?;
261+
let item_names = uns_group.list()?;
262+
263+
for item_name in item_names.iter() {
264+
let item_container = DataContainer::open(&uns_group, &item_name)?;
265+
let data = Data::read(&item_container)?;
266+
let im_element = IMElement::new(data);
267+
target_uns.add_data(item_name.clone(), im_element)?;
268+
}
269+
270+
if config.show_progress && !item_names.is_empty() {
271+
println!(" {} items loaded in uns", item_names.len());
272+
}
273+
274+
Ok(())
275+
}

0 commit comments

Comments
 (0)