Skip to content

Commit b36794b

Browse files
authored
Pass dask chunks to rasterize in clip_polygon to keep mask lazy (#1207) (#1209)
clip_polygon() was calling rasterize() without chunks=, so the polygon mask was always built as a full numpy array even for dask-backed inputs. For large rasters this defeats chunked evaluation and OOMs. Now extracts chunk sizes from the input raster and passes them through so rasterize uses its _run_dask_numpy path. Also sets use_cuda=True for dask+cupy inputs.
1 parent c4bb608 commit b36794b

2 files changed

Lines changed: 42 additions & 2 deletions

File tree

xrspatial/polygon_clip.py

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -198,8 +198,9 @@ def clip_polygon(
198198
raster = _crop_to_bbox(raster, geom_pairs, all_touched=all_touched)
199199

200200
# Build a binary mask via rasterize, aligned to the (possibly cropped)
201-
# raster grid. Always produce a plain numpy mask first, then convert
202-
# to the raster's backend so xarray's .where() sees matching types.
201+
# raster grid. Propagate the raster's chunk structure so the mask is
202+
# built lazily for dask backends instead of materializing a full numpy
203+
# array.
203204
from .rasterize import rasterize
204205

205206
kw = dict(rasterize_kw or {})
@@ -208,6 +209,12 @@ def clip_polygon(
208209
kw['dtype'] = np.uint8
209210
kw['all_touched'] = all_touched
210211

212+
if has_dask_array() and isinstance(raster.data, da.Array):
213+
rc, cc = raster.data.chunks[-2], raster.data.chunks[-1]
214+
kw.setdefault('chunks', (rc[0], cc[0]))
215+
if has_cuda_and_cupy() and is_dask_cupy(raster):
216+
kw.setdefault('use_cuda', True)
217+
211218
mask = rasterize(geom_pairs, **kw)
212219

213220
# Apply the mask. Keep it lazy for dask backends to avoid

xrspatial/tests/test_polygon_clip.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -325,3 +325,36 @@ def test_geoseries_input(self):
325325
gs = geopandas.GeoSeries([poly])
326326
result = clip_polygon(raster, gs, crop=False)
327327
assert result.shape == raster.shape
328+
329+
330+
# ---------------------------------------------------------------------------
331+
# Issue #1207 regression tests
332+
# ---------------------------------------------------------------------------
333+
334+
@dask_array_available
335+
class TestClipPolygonDaskLazyMask:
336+
def test_mask_stays_lazy_for_dask_input(self):
337+
"""clip_polygon on dask input should not materialize a full numpy mask (#1207).
338+
339+
We verify by checking that the dask task graph contains rasterize
340+
chunk tasks (not just a single from_array wrapping a pre-computed
341+
numpy array).
342+
"""
343+
import dask.array as da
344+
345+
dk_raster = _make_raster(backend='dask+numpy', chunks=(4, 3))
346+
poly = _inner_polygon()
347+
348+
result = clip_polygon(dk_raster, poly, crop=False)
349+
assert isinstance(result.data, da.Array)
350+
351+
# With chunked rasterize, the graph has tasks per chunk.
352+
# With the old approach (numpy mask + da.from_array), the graph
353+
# would have fewer chunk-level tasks for the mask.
354+
graph = dict(result.data.__dask_graph__())
355+
# At minimum, a 8x6 raster with (4,3) chunks = 2x2 = 4 mask chunks
356+
# plus raster chunks plus where-condition tasks.
357+
# Just verify we have more than the trivial single-mask case.
358+
assert len(graph) > 4, (
359+
f"graph has only {len(graph)} tasks; mask may not be chunked"
360+
)

0 commit comments

Comments
 (0)