Skip to content

Commit 0131772

Browse files
committed
rimport: stage_data() now checks whether file can be downloaded.
1 parent 43250d7 commit 0131772

3 files changed

Lines changed: 142 additions & 4 deletions

File tree

rimport

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,13 +15,16 @@ import shutil
1515
import sys
1616
from pathlib import Path
1717
from typing import Iterable, List
18+
from urllib.request import Request, urlopen
19+
from urllib.error import HTTPError
1820

1921
import shared
2022

2123
DEFAULT_INPUTDATA_ROOT = Path(shared.DEFAULT_INPUTDATA_ROOT)
2224
DEFAULT_STAGING_ROOT = Path(shared.DEFAULT_STAGING_ROOT)
2325
STAGE_OWNER = "cesmdata"
2426
INDENT = " "
27+
INPUTDATA_URL = "https://osdf-data.gdex.ucar.edu/ncar/gdex/d651077/cesmdata/inputdata"
2528

2629

2730
def build_parser() -> argparse.ArgumentParser:
@@ -141,7 +144,9 @@ def normalize_paths(root: Path, relnames: Iterable[str]) -> List[Path]:
141144
return paths
142145

143146

144-
def stage_data(src: Path, inputdata_root: Path, staging_root: Path, check: bool = False) -> None:
147+
def stage_data(
148+
src: Path, inputdata_root: Path, staging_root: Path, check: bool = False
149+
) -> None:
145150
"""Stage a file by mirroring its path under `staging_root`.
146151
147152
Destination path is computed by replacing the `inputdata_root` prefix of `src`
@@ -173,6 +178,10 @@ def stage_data(src: Path, inputdata_root: Path, staging_root: Path, check: bool
173178
f"({staging_root})"
174179
)
175180
print(f"{INDENT}File is already published and linked.")
181+
if can_file_be_downloaded(src.resolve(), staging_root):
182+
print(f"{INDENT}File is available for download.")
183+
else:
184+
print(f"{INDENT}File is not (yet) available for download.")
176185
return
177186

178187
# TODO: Check whether file is published but not relinked.
@@ -254,6 +263,38 @@ def get_staging_root() -> Path:
254263
return DEFAULT_STAGING_ROOT
255264

256265

266+
def can_file_be_downloaded(
    file_relpath: Path, staging_root: Path, timeout: float = 10
) -> bool:
    """Check whether a file is available for download from the CESM inputdata server.

    Sends a HEAD request to the CESM inputdata URL to verify that the file exists and is
    accessible without downloading the entire file.

    Args:
        file_relpath: Relative path to the file (relative to staging_root), or an absolute
            path that will be made relative to staging_root.
        staging_root: Root directory of the staging area, used to compute relative path
            if file_relpath is absolute.
        timeout: Maximum time in seconds to wait for the server response. Default is 10.

    Returns:
        bool: True if the file is accessible (HTTP status 2xx or 3xx), False otherwise
            (including 404, network errors, timeouts, etc.).

    Raises:
        ValueError: If file_relpath is absolute but is not located under staging_root
            (raised by Path.relative_to()).
    """
    # Local import: only this function needs it, so the module-level imports stay as-is.
    from urllib.parse import quote

    # Get URL
    if file_relpath.is_absolute():
        file_relpath = file_relpath.relative_to(staging_root)
    # URLs always use forward slashes, so join with "/" rather than the OS path
    # separator, and percent-encode characters that are not valid in a URL path.
    url = f"{INPUTDATA_URL.rstrip('/')}/{quote(file_relpath.as_posix())}"

    # Check whether URL can be accessed
    req = Request(url, method="HEAD")
    try:
        with urlopen(req, timeout=timeout) as resp:
            return 200 <= resp.status < 400
    except HTTPError:
        # Server reached, but resource doesn't exist (404, 410, etc.)
        return False
    except OSError:
        # Server could not be reached: URLError (DNS failure, refused connection),
        # timeouts, and connection resets are all OSError subclasses. Report the file
        # as unavailable, as the docstring promises, instead of crashing the caller.
        return False
296+
297+
257298
def main(argv: List[str] | None = None) -> int:
258299
"""Main entry point for the rimport tool.
259300
Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,61 @@
1+
"""
2+
Tests for can_file_be_downloaded() function in rimport script.
3+
"""
4+
5+
import os
6+
import sys
7+
import importlib.util
8+
from importlib.machinery import SourceFileLoader
9+
from pathlib import Path
10+
11+
from shared import DEFAULT_STAGING_ROOT
12+
13+
# Import rimport module from file without .py extension
14+
# The rimport script has no .py extension, so a plain `import rimport` cannot find it.
# Locate it by walking up from this file (file -> its directory -> parent -> grandparent)
# and load it explicitly with SourceFileLoader.
_this_file = os.path.abspath(__file__)
_rimport_dir = os.path.dirname(os.path.dirname(os.path.dirname(_this_file)))
rimport_path = os.path.join(_rimport_dir, "rimport")

loader = SourceFileLoader("rimport", rimport_path)
spec = importlib.util.spec_from_loader("rimport", loader)
if spec is None:
    raise ImportError(f"Could not create spec for rimport from {rimport_path}")

# Register the module under its name before executing it, then run its top-level code.
rimport = importlib.util.module_from_spec(spec)
sys.modules["rimport"] = rimport
loader.exec_module(rimport)

# Path (relative to the staging root) of a file the tests below expect to exist.
RELPATH_THAT_DOES_EXIST = os.path.join(
    "share",
    "meshes",
    "ne3pg3_ESMFmesh_c221214_cdf5.asc",
)
29+
30+
31+
class TestCanFileBeDownloaded:
    """Checks for rimport.can_file_be_downloaded() with existing and nonexistent files."""

    @staticmethod
    def _existing_abspath():
        """Return the absolute path of the reference file under DEFAULT_STAGING_ROOT."""
        return Path(os.path.join(DEFAULT_STAGING_ROOT, RELPATH_THAT_DOES_EXIST))

    def test_existing_file_exists(self):
        """Sanity check: the reference file must exist, or the other tests will fail."""
        assert self._existing_abspath().exists()

    def test_true_abspath(self):
        """An existing file given as an absolute path is reported as downloadable."""
        downloadable = rimport.can_file_be_downloaded(
            self._existing_abspath(), DEFAULT_STAGING_ROOT
        )
        assert downloadable

    def test_true_relpath(self):
        """An existing file given as a relative path is reported as downloadable."""
        downloadable = rimport.can_file_be_downloaded(
            Path(RELPATH_THAT_DOES_EXIST), DEFAULT_STAGING_ROOT
        )
        assert downloadable

    def test_false_nonexistent(self):
        """A path that does not exist on the server is reported as not downloadable."""
        bogus_relpath = Path("weurueridniduafnea/smfnigsroerij/msdif8ernnr.nc")
        downloadable = rimport.can_file_be_downloaded(
            bogus_relpath, DEFAULT_STAGING_ROOT
        )
        assert not downloadable

tests/rimport/test_stage_data.py

Lines changed: 39 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -88,12 +88,12 @@ def test_preserves_directory_structure(self, inputdata_root, staging_root):
8888
assert dst.exists()
8989
assert dst.read_text() == "nested data"
9090

91-
def test_prints_live_symlink_already_published(
91+
def test_prints_live_symlink_already_published_not_downloadable(
9292
self, inputdata_root, staging_root, capsys
9393
):
9494
"""
9595
Test that staging a live, already-published symlink prints a message and returns
96-
immediately without copying anything.
96+
immediately without copying anything. Should say it's not available for download.
9797
"""
9898
# Create a real file in staging and a symlink to it in inputdata
9999
real_file = staging_root / "real_file.nc"
@@ -106,10 +106,12 @@ def test_prints_live_symlink_already_published(
106106
# Should print message for live symlink and return early
107107
rimport.stage_data(src, inputdata_root, staging_root)
108108

109-
# Verify the right message was printed
109+
# Verify the right messages were printed
110110
stdout = capsys.readouterr().out.strip()
111111
msg = "File is already published and linked"
112112
assert msg in stdout
113+
msg = "File is not (yet) available for download"
114+
assert msg in stdout
113115

114116
# Verify the WRONG message was NOT printed
115117
msg = "is already under staging directory"
@@ -118,6 +120,40 @@ def test_prints_live_symlink_already_published(
118120
# Verify that shutil.copy2 was never called (function returned early)
119121
mock_copy.assert_not_called()
120122

123+
def test_prints_live_symlink_already_published_is_downloadable(
    self, inputdata_root, staging_root, capsys
):
    """
    Stage a live symlink that already points into staging, with
    can_file_be_downloaded() mocked to return True, and check that the
    "is available for download" message is printed and nothing is copied.
    """
    # Real file lives in staging; inputdata holds a symlink pointing at it
    target = staging_root / "real_file.nc"
    target.write_text("data")
    src = inputdata_root / "link.nc"
    src.symlink_to(target)

    # Patch the copy (to prove it is never called) and force the download check to True
    with patch("shutil.copy2") as copy_mock, patch(
        "rimport.can_file_be_downloaded", return_value=True
    ):
        rimport.stage_data(src, inputdata_root, staging_root)

        # stage_data() must have returned early, before any copy
        copy_mock.assert_not_called()

    stdout = capsys.readouterr().out.strip()

    # Both expected messages must be present
    for expected in (
        "File is already published and linked",
        "File is available for download",
    ):
        assert expected in stdout

    # The message for the other code path must be absent
    assert "is already under staging directory" not in stdout
156+
121157
def test_raises_error_for_live_symlink_pointing_somewhere_other_than_staging(
122158
self, tmp_path, inputdata_root, staging_root
123159
):

0 commit comments

Comments
 (0)