diff --git a/conda-env.yml b/conda-env.yml index 80959ab9..02d61af7 100644 --- a/conda-env.yml +++ b/conda-env.yml @@ -13,7 +13,6 @@ dependencies: - jax>=0.4.19 - numba>=0.56 - numpy>=1.23 - - opera-utils>=0.4.1 - pydantic>=2.1 - pyproj>=3.3 - rasterio>=1.3 @@ -23,3 +22,5 @@ dependencies: - tqdm>=4.60 - typing_extensions>=3.10 - tyro>=0.9.20 + - pip: + - git+https://github.com/opera-adt/opera-utils.git@main diff --git a/requirements.txt b/requirements.txt index f8f4ffe3..221b268a 100644 --- a/requirements.txt +++ b/requirements.txt @@ -3,7 +3,7 @@ h5py>=3.6 jax>=0.4.19 numba>=0.56 numpy>=1.23 -opera-utils>=0.4.1 +opera-utils>=0.25.7 pydantic>=2.1 pyproj>=3.3 rasterio>=1.3 diff --git a/src/dolphin/io/_core.py b/src/dolphin/io/_core.py index 94b4ff59..b4d6ff26 100644 --- a/src/dolphin/io/_core.py +++ b/src/dolphin/io/_core.py @@ -311,7 +311,9 @@ def _is_nisar_h5(filename: Filename) -> bool: def read_nisar_grid_metadata( filename: Filename, subdataset: str ) -> tuple[int, int, tuple[float, ...], int, str]: - """Read NISAR GSLC grid metadata via h5py. + """Read NISAR GSLC grid metadata via h5py (local) or. + + earthaccess+h5netcdf (remote). NISAR's GSLC files store the projection in a sibling ``projection`` dataset and grid coordinates in ``xCoordinates`` / ``yCoordinates`` @@ -325,12 +327,29 @@ def read_nisar_grid_metadata( Returns ``(nx, ny, geotransform, epsg, projection_wkt)``. """ + import re from pathlib import PurePosixPath - from osgeo import osr + from opera_utils import is_remote_url + from opera_utils._remote import open_h5 as open_remote_h5 + file_str = str(filename) + # Normalize malformed URLs missing one slash (e.g., https:/ -> https://) + file_str = re.sub(r"^(https?|s3):/(?!/)", r"\1://", file_str) grid_path = str(PurePosixPath(subdataset).parent) - with h5py.File(str(filename), "r") as f: + from osgeo import osr + + if is_remote_url(file_str): + if file_str.startswith("s3://"): + raise ValueError( + "Direct S3 links are not supported out-of-region." + " Please pass the HTTPS URL equivalent." + ) + h5_open = open_remote_h5(file_str) + else: + h5_open = h5py.File(file_str, "r") + + with h5_open as f: dset = f[subdataset] proj_name = dset.attrs.get("grid_mapping", "projection") if isinstance(proj_name, bytes): @@ -341,7 +360,8 @@ def read_nisar_grid_metadata( y_coords = f[f"{grid_path}/yCoordinates"][:] dx = float(f[f"{grid_path}/xCoordinateSpacing"][()]) dy = float(f[f"{grid_path}/yCoordinateSpacing"][()]) - # Cell-center coords -> upper-left edge: back off half a pixel. + + # 3. GeoTransform Calculation (Identical for both local and remote) gt = ( float(x_coords[0]) - dx / 2.0, dx, diff --git a/src/dolphin/io/_readers.py b/src/dolphin/io/_readers.py index 74997559..db187d77 100644 --- a/src/dolphin/io/_readers.py +++ b/src/dolphin/io/_readers.py @@ -21,7 +21,7 @@ import rasterio as rio import rasterio.windows from numpy.typing import ArrayLike -from opera_utils import get_dates, sort_files_by_date +from opera_utils import get_dates, is_remote_url, sort_files_by_date from osgeo import gdal from tqdm.auto import trange @@ -781,9 +781,19 @@ def __init__( if any(str(f).startswith("s3://") for f in file_list): files = [S3Path(str(f)) for f in file_list] elif use_abs_path: - files = [utils._resolve_gdal_path(p) for p in file_list] + # Skip resolving remote URLs (https://, http://) — Path.resolve() + # would prepend the cwd to them. + files = [ + p if is_remote_url(p) else utils._resolve_gdal_path(p) + for p in file_list + ] else: files = list(file_list) + + # Sanitize remote URLs: Path() collapses `https://` to `https:/`, + # which GDAL's HDF5 driver cannot resolve. Restore the missing slash. + files = [_sanitize_remote_url(f) for f in files] + # Extract the date/datetimes from the filenames dates = [get_dates(f, fmt=file_date_fmt) for f in file_list] if sort_files: @@ -1075,6 +1085,22 @@ def _parse_vrt_file(vrt_file): return filepaths, sds +def _sanitize_remote_url(file: Filename) -> Filename: + """Restore the missing slash on URLs that Path() has collapsed. + + ``Path("https://foo")`` round-trips to ``"https:/foo"`` because pathlib + collapses consecutive slashes. Return the URL as a plain string so the + double slash is preserved (wrapping it back in Path would collapse again). + """ + import re + + s = str(file) + fixed = re.sub(r"^(https?|s3):/(?!/)", r"\1://", s) + if fixed == s and not is_remote_url(s): + return file + return fixed + + def _assert_images_same_size(files): """Ensure all files are the same size.""" with ThreadPoolExecutor(5) as executor: diff --git a/src/dolphin/utils.py b/src/dolphin/utils.py index ce980ae7..b6f5e306 100644 --- a/src/dolphin/utils.py +++ b/src/dolphin/utils.py @@ -79,6 +79,8 @@ def _get_path_from_gdal_str(name: Filename) -> Path: def _resolve_gdal_path(gdal_str: Filename) -> Filename: """Resolve the file portion of a gdal-openable string to an absolute path.""" + from opera_utils import is_remote_url + s_clean = str(gdal_str).lstrip('"').lstrip("'").rstrip('"').rstrip("'") prefixes = ["DERIVED_SUBDATASET", "NETCDF", "HDF"] is_gdal_str = any(s_clean.upper().startswith(pre) for pre in prefixes) @@ -86,6 +88,9 @@ def _resolve_gdal_path(gdal_str: Filename) -> Filename: # strip quotes to add back in after file_part = file_part.strip('"').strip("'") + # Skip remote URLs — Path.resolve() would prepend the cwd to them. + if is_remote_url(file_part): + return gdal_str file_part_resolved = Path(file_part).resolve() resolved = s_clean.replace(file_part, str(file_part_resolved)) return Path(resolved) if not is_gdal_str else resolved diff --git a/src/dolphin/workflows/config/_common.py b/src/dolphin/workflows/config/_common.py index 6f6e39b7..85e8a7f4 100644 --- a/src/dolphin/workflows/config/_common.py +++ b/src/dolphin/workflows/config/_common.py @@ -7,6 +7,7 @@ from typing import Annotated, Any, Literal, Optional import tyro +from opera_utils import is_remote_url from pydantic import ( BaseModel, ConfigDict, @@ -540,8 +541,12 @@ def _read_file_list_or_glob(cls, value): # noqa: ARG001 filenames = [Path(f) for f in v_path.read_text().splitlines()] # If given as relative paths, make them relative to the text file + # Skip this for remote URLs (https://, http://, s3://) parent = v_path.parent - return [parent / f if not f.is_absolute() else f for f in filenames] + return [ + parent / f if (not f.is_absolute() and not is_remote_url(f)) else f + for f in filenames + ] else: msg = f"Input file list {v_path} does not exist or is not a file." raise ValueError(msg) diff --git a/src/dolphin/workflows/config/_displacement.py b/src/dolphin/workflows/config/_displacement.py index ce48ec12..22d968a6 100644 --- a/src/dolphin/workflows/config/_displacement.py +++ b/src/dolphin/workflows/config/_displacement.py @@ -5,7 +5,7 @@ from typing import Annotated, Any, Optional import tyro -from opera_utils import get_burst_id, get_dates, sort_files_by_date +from opera_utils import get_burst_id, get_dates, is_remote_url, sort_files_by_date from pydantic import ( BaseModel, ConfigDict, @@ -199,8 +199,11 @@ def model_post_init(self, context: Any, /) -> None: # Ensure outputs from workflow steps are within work directory. if not self.keep_paths_relative: - # Resolve all CSLC paths: - self.cslc_file_list = [p.resolve(strict=False) for p in self.cslc_file_list] + # Resolve all CSLC paths (skip remote URLs like https://, s3://): + self.cslc_file_list = [ + p if is_remote_url(p) else p.resolve(strict=False) + for p in self.cslc_file_list + ] work_dir = self.work_directory # For each workflow step that has an output folder, move it inside diff --git a/src/dolphin/workflows/config/_ps.py b/src/dolphin/workflows/config/_ps.py index bccc056a..c3d7178f 100644 --- a/src/dolphin/workflows/config/_ps.py +++ b/src/dolphin/workflows/config/_ps.py @@ -4,6 +4,7 @@ from pathlib import Path from typing import Any +from opera_utils import is_remote_url from pydantic import ConfigDict, Field, field_validator from ._common import ( @@ -62,8 +63,11 @@ def model_post_init(self, context: Any, /) -> None: super().model_post_init(context) # Ensure outputs from workflow steps are within work directory. if not self.keep_paths_relative: - # Resolve all CSLC paths: - self.cslc_file_list = [p.resolve(strict=False) for p in self.cslc_file_list] + # Resolve all CSLC paths (skip remote URLs like https://, s3://): + self.cslc_file_list = [ + p if is_remote_url(p) else p.resolve(strict=False) + for p in self.cslc_file_list + ] work_dir = self.work_directory # move the folders inside the work directory diff --git a/src/dolphin/workflows/displacement.py b/src/dolphin/workflows/displacement.py index 107767bc..5d0a246c 100755 --- a/src/dolphin/workflows/displacement.py +++ b/src/dolphin/workflows/displacement.py @@ -82,14 +82,35 @@ def _prepare_grouped_inputs(cfg: DisplacementWorkflow) -> _GroupedInputs: bounds mask inside ``wrapped_phase._get_mask`` clips them per block automatically when bounds are set. """ + import re + + def _normalize_url(path: Path) -> Path: + # Fix malformed URLs missing one slash (e.g., https:/foo -> https://foo). + path_str = str(path) + fixed = re.sub(r"^(https?|s3):/(?!/)", r"\1://", path_str) + return Path(fixed) if fixed != path_str else path + + cfg.cslc_file_list = [_normalize_url(p) for p in cfg.cslc_file_list] + if cfg.amplitude_dispersion_files: + cfg.amplitude_dispersion_files = [ + _normalize_url(p) for p in cfg.amplitude_dispersion_files + ] + if cfg.amplitude_mean_files: + cfg.amplitude_mean_files = [_normalize_url(p) for p in cfg.amplitude_mean_files] + if cfg.layover_shadow_mask_files: + cfg.layover_shadow_mask_files = [ + _normalize_url(p) for p in cfg.layover_shadow_mask_files + ] + is_opera_burst_mode = True try: grouped_slc_files = group_by_burst(cfg.cslc_file_list) block_bounds: dict[str, BlockBounds] = {} except ValueError as e: - if "Could not parse burst id" not in str(e): - raise - # Non-OPERA-burst inputs (e.g. NISAR full-frame GSLCs). + # Could not parse burst id or other ValueError - fall back to non-OPERA mode + logger.warning( + f"Unable to parse OPERA bursts ({e}), falling back to non-OPERA mode" + ) is_opera_burst_mode = False if cfg.input_options.azimuth_blocks > 1: # Split the single frame into N synthetic "bursts" for parallelism. @@ -209,7 +230,7 @@ def run( # ###################################### # 1. Burst-wise Wrapped phase estimation # ###################################### - logger.info(f"Found SLC files from {len(grouped_slc_files)} bursts") + logger.info(f"Found SLC files from {len(grouped_slc_files)} bursts/blocks") wrapped_phase_cfgs = [ ( burst, # Include the burst for logging purposes diff --git a/tests/test_goldstein.py b/tests/test_goldstein.py new file mode 100644 index 00000000..900e266e --- /dev/null +++ b/tests/test_goldstein.py @@ -0,0 +1,78 @@ +import numpy as np +import pytest + +from dolphin.goldstein import goldstein + + +@pytest.fixture +def smooth_complex(): + """A smooth, fully-valid complex interferogram patch.""" + ramp = np.linspace(0, 6, 64)[:, None] * np.ones((64, 64)) + return np.exp(1j * ramp).astype(np.complex64) + + +def test_output_shape_and_dtype_complex(smooth_complex): + out = goldstein(smooth_complex, alpha=0.5, psize=32) + assert out.shape == smooth_complex.shape + assert np.iscomplexobj(out) + + +def test_real_phase_input_returns_complex(smooth_complex): + """Real (float) phase input is wrapped to complex before filtering.""" + phase = np.angle(smooth_complex).astype(np.float64) + out = goldstein(phase, alpha=0.5, psize=32) + assert out.shape == phase.shape + assert np.iscomplexobj(out) + + +def test_alpha_zero_is_near_identity(smooth_complex): + """alpha=0 disables spectral weighting, so the overlap-add reconstructs input.""" + out = goldstein(smooth_complex, alpha=0.0, psize=32) + np.testing.assert_allclose(out, smooth_complex, atol=1e-5) + + +def test_negative_alpha_raises(smooth_complex): + with pytest.raises(ValueError, match="alpha must be >= 0"): + goldstein(smooth_complex, alpha=-0.1, psize=32) + + +def test_all_zero_returned_unchanged(): + data = np.zeros((40, 40), dtype=np.complex64) + out = goldstein(data, alpha=0.5, psize=32) + assert np.array_equal(out, data) + + +def test_all_nan_returned_unchanged(): + data = np.full((40, 40), np.nan, dtype=np.complex64) + out = goldstein(data, alpha=0.5, psize=32) + assert out.shape == data.shape + assert np.all(np.isnan(out)) + + +def test_empty_pixels_zeroed_in_output(smooth_complex): + """Zero-magnitude (invalid) input pixels are forced back to 0 on output.""" + data = smooth_complex.copy() + data[0, 0] = 0 + data[5, 7] = 0 + out = goldstein(data, alpha=0.5, psize=32) + assert out[0, 0] == 0 + assert out[5, 7] == 0 + + +def test_increasing_alpha_changes_output(smooth_complex): + """Higher alpha applies stronger filtering, producing a different result.""" + out_low = goldstein(smooth_complex, alpha=0.1, psize=32) + out_high = goldstein(smooth_complex, alpha=0.9, psize=32) + assert not np.allclose(out_low, out_high) + + +def test_non_divisible_shape_preserved(): + """Input dims that are not a multiple of psize are padded and cropped back.""" + rng = np.random.default_rng(42) + data = (rng.standard_normal((50, 37)) + 1j * rng.standard_normal((50, 37))).astype( + np.complex64 + ) + out = goldstein(data, alpha=0.5, psize=32) + assert out.shape == (50, 37) + # Fully valid input -> no pixel forced to zero by the empty mask + assert np.all(np.isfinite(out))