Skip to content

Commit f593ebd

Browse files
authored
Add H3 spatial indexing, two-tier facet loading, and benchmark optimizations (#5)
Add H3 spatial indexing, two-tier facet loading, and benchmark optimizations

## Changes
- isamples_explorer.qmd: Two-tier facet loading (2KB summary for instant counts)
- parquet_cesium_isamples_wide.qmd: Zoom-adaptive H3 clustering with LOD
- zenodo_isamples_analysis.qmd: Data-driven H3 regional analysis
- narrow_vs_wide_performance.qmd: Added geospatial and facet benchmarks

## Fixes Applied (Codex review)
- Fixed MODE(n) → MODE(source) for cluster coloring
- Added camera listener cleanup to prevent leaks
- Added NaN guard for cluster label parsing
- Added user-facing warning for facet summary failures

Closes #1, #2, #3, #4
1 parent 4cd3164 commit f593ebd

4 files changed

Lines changed: 916 additions & 238 deletions

File tree

tutorials/isamples_explorer.qmd

Lines changed: 145 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ Search and explore **6.7 million physical samples** from scientific collections
1212

1313
::: {.callout-note}
1414
### Serverless Architecture
15-
This app queries a ~280 MB Parquet file directly in your browser using DuckDB-WASM. No server required!
15+
This app uses a **two-tier loading strategy**: a 2KB pre-computed summary loads instantly for facet counts (source, material, context, specimen type), while the full ~280 MB Parquet file is only queried when drilling into records. All powered by DuckDB-WASM in your browser -- no server required!
1616
:::
1717

1818
## Setup
@@ -28,6 +28,9 @@ duckdbModule = import("https://cdn.jsdelivr.net/npm/@duckdb/duckdb-wasm@1.28.0/+
2828
// Data source configuration
2929
parquet_url = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_wide.parquet"
3030
31+
// Pre-computed facet summaries (2KB - loads instantly)
32+
facet_summaries_url = "https://pub-a18234d962364c22a50c787b7ca09fa5.r2.dev/isamples_202601_facet_summaries.parquet"
33+
3134
// Source color scheme (consistent with iSamples conventions)
3235
SOURCE_COLORS = ({
3336
'SESAR': '#3366CC', // Blue
@@ -79,14 +82,18 @@ viewof searchInput = Inputs.text({
7982

8083
### Filters
8184

85+
```{ojs}
86+
facetSummariesWarning
87+
```
88+
8289
**Source**
8390

8491
```{ojs}
8592
//| code-fold: true
86-
// Source checkboxes with counts
93+
// Source checkboxes with counts - uses pre-computed summaries for instant load
8794
viewof sourceCheckboxes = {
88-
// Get source counts based on current search
89-
const counts = await sourceCounts;
95+
// Use pre-computed facet summaries (instant) instead of scanning full parquet
96+
const counts = facetsByType.source;
9097
const options = counts.map(r => r.value);
9198
9299
return Inputs.checkbox(options, {
@@ -104,6 +111,69 @@ viewof sourceCheckboxes = {
104111
}
105112
```
106113

114+
**Material**
115+
116+
```{ojs}
117+
//| code-fold: true
118+
// Material filter - loaded from pre-computed summaries
119+
viewof materialCheckboxes = {
120+
const counts = facetsByType.material;
121+
const options = counts.map(r => r.value);
122+
return Inputs.checkbox(options, {
123+
value: [],
124+
format: (x) => {
125+
const r = counts.find(s => s.value === x);
126+
const count = r ? Number(r.count).toLocaleString() : "0";
127+
return html`<span style="display: inline-flex; align-items: center; gap: 4px;">
128+
${x} <span style="color: #888; font-size: 11px;">(${count})</span>
129+
</span>`;
130+
}
131+
});
132+
}
133+
```
134+
135+
**Sampled Feature**
136+
137+
```{ojs}
138+
//| code-fold: true
139+
// Context filter - loaded from pre-computed summaries
140+
viewof contextCheckboxes = {
141+
const counts = facetsByType.context;
142+
const options = counts.map(r => r.value);
143+
return Inputs.checkbox(options, {
144+
value: [],
145+
format: (x) => {
146+
const r = counts.find(s => s.value === x);
147+
const count = r ? Number(r.count).toLocaleString() : "0";
148+
return html`<span style="display: inline-flex; align-items: center; gap: 4px;">
149+
${x} <span style="color: #888; font-size: 11px;">(${count})</span>
150+
</span>`;
151+
}
152+
});
153+
}
154+
```
155+
156+
**Specimen Type**
157+
158+
```{ojs}
159+
//| code-fold: true
160+
// Object type filter - loaded from pre-computed summaries
161+
viewof objectTypeCheckboxes = {
162+
const counts = facetsByType.object_type;
163+
const options = counts.map(r => r.value);
164+
return Inputs.checkbox(options, {
165+
value: [],
166+
format: (x) => {
167+
const r = counts.find(s => s.value === x);
168+
const count = r ? Number(r.count).toLocaleString() : "0";
169+
return html`<span style="display: inline-flex; align-items: center; gap: 4px;">
170+
${x} <span style="color: #888; font-size: 11px;">(${count})</span>
171+
</span>`;
172+
}
173+
});
174+
}
175+
```
176+
107177
```{ojs}
108178
//| code-fold: true
109179
html`<a href="?" style="font-size: 13px;">Clear All Filters</a>`
@@ -131,6 +201,9 @@ viewof maxSamples = Inputs.range([1000, 100000], {
131201
const params = new URLSearchParams();
132202
if (searchInput) params.set("q", searchInput);
133203
if (sourceCheckboxes?.length) params.set("sources", sourceCheckboxes.join(","));
204+
if (materialCheckboxes?.length) params.set("material", materialCheckboxes.join(","));
205+
if (contextCheckboxes?.length) params.set("context", contextCheckboxes.join(","));
206+
if (objectTypeCheckboxes?.length) params.set("object_type", objectTypeCheckboxes.join(","));
134207
if (viewMode !== "globe") params.set("view", viewMode);
135208
136209
const newUrl = params.toString() ? `?${params.toString()}` : window.location.pathname;
@@ -264,7 +337,50 @@ async function runQuery(sql) {
264337

265338
```{ojs}
266339
//| code-fold: true
267-
// Build WHERE clause from current filters
340+
// Tier 1: Load pre-computed facet summaries (2KB, instant)
341+
facetSummaries = {
342+
facetSummariesError = null;
343+
try {
344+
const rows = await runQuery(`SELECT * FROM read_parquet('${facet_summaries_url}')`);
345+
return rows;
346+
} catch (e) {
347+
console.error("Facet summaries load error:", e);
348+
facetSummariesError = e;
349+
return [];
350+
}
351+
}
352+
353+
```
354+
355+
```{ojs}
356+
//| code-fold: true
357+
facetSummariesWarning = {
358+
if (!facetSummariesError) return null;
359+
return html`<div style="margin: 6px 0 10px; padding: 8px 10px; border: 1px solid #f0b429; background: #fff7e6; border-radius: 6px; color: #7a4b00; font-size: 12px;">
360+
Facet summaries failed to load. Filter counts may be missing. Try refreshing.
361+
</div>`;
362+
}
363+
364+
// Extract facet counts by type from pre-computed summaries
365+
facetsByType = {
366+
const grouped = { source: [], material: [], context: [], object_type: [] };
367+
for (const row of facetSummaries) {
368+
const ft = row.facet_type;
369+
if (grouped[ft]) {
370+
grouped[ft].push({ value: row.facet_value, count: Number(row.count), scheme: row.scheme });
371+
}
372+
}
373+
// Sort each by count descending
374+
for (const key of Object.keys(grouped)) {
375+
grouped[key].sort((a, b) => b.count - a.count);
376+
}
377+
return grouped;
378+
}
379+
```
380+
381+
```{ojs}
382+
//| code-fold: true
383+
// Build WHERE clause from current filters (Tier 2: queries full parquet only when filtering)
268384
whereClause = {
269385
const conditions = [
270386
"otype = 'MaterialSampleRecord'",
@@ -288,40 +404,36 @@ whereClause = {
288404
conditions.push(`n IN (${sourceList})`);
289405
}
290406
407+
// Material filter
408+
const materials = Array.from(materialCheckboxes || []);
409+
if (materials.length > 0) {
410+
const matList = materials.map(m => `'${m.replace(/'/g, "''")}'`).join(", ");
411+
conditions.push(`has_material_category IN (${matList})`);
412+
}
413+
414+
// Context (sampled feature) filter
415+
const contexts = Array.from(contextCheckboxes || []);
416+
if (contexts.length > 0) {
417+
const ctxList = contexts.map(c => `'${c.replace(/'/g, "''")}'`).join(", ");
418+
conditions.push(`has_context_category IN (${ctxList})`);
419+
}
420+
421+
// Object type (specimen type) filter
422+
const objectTypes = Array.from(objectTypeCheckboxes || []);
423+
if (objectTypes.length > 0) {
424+
const otList = objectTypes.map(o => `'${o.replace(/'/g, "''")}'`).join(", ");
425+
conditions.push(`has_specimen_category IN (${otList})`);
426+
}
427+
291428
return conditions.join(" AND ");
292429
}
293430
```
294431

295432
```{ojs}
296433
//| code-fold: true
297-
// Get source facet counts (respects text search but not source filter)
298-
sourceCounts = {
299-
let baseWhere = "otype = 'MaterialSampleRecord' AND latitude IS NOT NULL";
300-
301-
if (searchInput?.trim()) {
302-
const term = searchInput.trim().replace(/'/g, "''");
303-
baseWhere += ` AND (
304-
label ILIKE '%${term}%'
305-
OR description ILIKE '%${term}%'
306-
OR CAST(place_name AS VARCHAR) ILIKE '%${term}%'
307-
)`;
308-
}
309-
310-
const query = `
311-
SELECT n as value, COUNT(*) as count
312-
FROM samples
313-
WHERE ${baseWhere}
314-
GROUP BY n
315-
ORDER BY count DESC
316-
`;
317-
318-
try {
319-
return await runQuery(query);
320-
} catch (e) {
321-
console.error("Facet query error:", e);
322-
return [];
323-
}
324-
}
434+
// Source counts now come from pre-computed facet summaries (Tier 1)
435+
// No longer scans the full parquet file on every page load
436+
sourceCounts = facetsByType.source
325437
```
326438

327439
```{ojs}

0 commit comments

Comments
 (0)