Skip to content

Commit 39e9f64

Browse files
committed
Enhance benchmarks: add --ignore-hardware option to restore full fail/warn behavior
1 parent 44cfbdf commit 39e9f64

4 files changed

Lines changed: 122 additions & 45 deletions

File tree

tests/benchmarks/baselines.json

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@
5555
"n": 15,
5656
"updated_at": "2026-04-04T13:16:00.474681+00:00"
5757
},
58-
"py_update_2d_64x64": {
58+
"py_set_data_2d_64x64": {
5959
"min_ms": 0.473,
6060
"mean_ms": 0.767,
6161
"max_ms": 1.102,
@@ -146,28 +146,28 @@
146146
"n": 15,
147147
"updated_at": "2026-04-04T13:16:00.020751+00:00"
148148
},
149-
"py_update_2d_256x256": {
149+
"py_set_data_2d_256x256": {
150150
"min_ms": 1.593,
151151
"mean_ms": 4.177,
152152
"max_ms": 10.228,
153153
"n": 15,
154154
"updated_at": "2026-04-04T13:16:00.592322+00:00"
155155
},
156-
"py_update_2d_512x512": {
156+
"py_set_data_2d_512x512": {
157157
"min_ms": 5.114,
158158
"mean_ms": 6.513,
159159
"max_ms": 7.349,
160160
"n": 15,
161161
"updated_at": "2026-04-04T13:16:00.715109+00:00"
162162
},
163-
"py_update_2d_1024x1024": {
163+
"py_set_data_2d_1024x1024": {
164164
"min_ms": 8.407,
165165
"mean_ms": 8.954,
166166
"max_ms": 10.195,
167167
"n": 15,
168168
"updated_at": "2026-04-04T13:16:00.916898+00:00"
169169
},
170-
"py_update_2d_2048x2048": {
170+
"py_set_data_2d_2048x2048": {
171171
"min_ms": 31.763,
172172
"mean_ms": 34.136,
173173
"max_ms": 38.112,

tests/conftest.py

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,17 @@ def pytest_addoption(parser):
4646
default=False,
4747
help="Include slow benchmark scenarios (4096², 8192² images) skipped in fast CI",
4848
)
49+
parser.addoption(
50+
"--ignore-hardware",
51+
action="store_true",
52+
default=False,
53+
help=(
54+
"Treat the current machine as matching the baseline host, restoring "
55+
"full fail/warn behaviour even on different hardware. By default, "
56+
"benchmarks still run and compare on mismatched hardware but "
57+
"regressions are downgraded from failures to warnings."
58+
),
59+
)
4960
parser.addoption(
5061
"--baselines-path",
5162
default=None,
@@ -302,6 +313,19 @@ def run_slow(request):
302313
return request.config.getoption("--run-slow")
303314

304315

316+
@pytest.fixture(scope="session")
317+
def ignore_hardware(request):
318+
"""True when --ignore-hardware was passed.
319+
320+
By default, benchmark comparisons run on every machine but regressions
321+
that would normally cause a *failure* are downgraded to *warnings* when
322+
the current hostname doesn't match ``_meta.host`` in ``baselines.json``.
323+
Pass ``--ignore-hardware`` to restore full fail behaviour regardless of
324+
which machine is running the tests.
325+
"""
326+
return request.config.getoption("--ignore-hardware")
327+
328+
305329
@pytest.fixture
306330
def bench_page(_pw_browser):
307331
"""Fixture: open a widget in headless Chromium and return the live Page.

tests/test_benchmarks.py

Lines changed: 51 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -86,7 +86,8 @@ def _save_baselines(data: dict) -> None:
8686

8787
def _check_or_update(name: str, timing: dict, update: bool,
8888
fail_ratio: float = FAIL_RATIO,
89-
warn_ratio: float = WARN_RATIO) -> None:
89+
warn_ratio: float = WARN_RATIO,
90+
ignore_hardware: bool = False) -> None:
9091
"""Assert timing is within threshold of stored baseline, or write it.
9192
9293
Parameters
@@ -99,6 +100,15 @@ def _check_or_update(name: str, timing: dict, update: bool,
99100
benchmarks use 2.5× because Playwright mouse-event timing
100101
is more variable under OS scheduler load.
101102
warn_ratio : ratio above which a warning (not failure) is emitted.
103+
ignore_hardware : when True, treat the current machine as matching the
104+
baseline host and apply full fail/warn behaviour.
105+
106+
Hardware matching
107+
-----------------
108+
When the current hostname differs from ``_meta.host`` in the baseline file,
109+
the test still runs and compares, but any result that would normally be a
110+
*failure* is downgraded to a *warning*. Pass ``--ignore-hardware`` to
111+
restore full fail behaviour regardless of hostname.
102112
"""
103113
if timing is None:
104114
pytest.skip(f"[{name}] No timing data returned (panel not found?)")
@@ -127,18 +137,33 @@ def _check_or_update(name: str, timing: dict, update: bool,
127137
f"[{name}] No baseline — run with --update-benchmarks to create one"
128138
)
129139

140+
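# Decide whether to treat this machine as the baseline host: hostnames are compared, and a baseline with no recorded host (or --ignore-hardware) counts as a match.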
141+
meta = baselines.get("_meta", {})
142+
baseline_host = meta.get("host")
143+
current_host = socket.gethostname()
144+
hw_match = ignore_hardware or not baseline_host or (baseline_host == current_host)
145+
hw_note = (
146+
""
147+
if hw_match
148+
else f" [different hardware: baseline={baseline_host!r}, current={current_host!r}]"
149+
)
150+
130151
baseline = baselines[name]
131152
ratio = timing["mean_ms"] / baseline["mean_ms"]
132153

133154
if ratio > fail_ratio:
134-
pytest.fail(
155+
msg = (
135156
f"[{name}] REGRESSION: mean {timing['mean_ms']:.2f} ms vs "
136-
f"baseline {baseline['mean_ms']:.2f} ms ({ratio:.2f}×)"
157+
f"baseline {baseline['mean_ms']:.2f} ms ({ratio:.2f}×){hw_note}"
137158
)
138-
if ratio > warn_ratio:
159+
if hw_match:
160+
pytest.fail(msg)
161+
else:
162+
warnings.warn(msg, stacklevel=2)
163+
elif ratio > warn_ratio:
139164
warnings.warn(
140165
f"[{name}] Perf degraded: mean {timing['mean_ms']:.2f} ms vs "
141-
f"baseline {baseline['mean_ms']:.2f} ms ({ratio:.2f}×)",
166+
f"baseline {baseline['mean_ms']:.2f} ms ({ratio:.2f}×){hw_note}",
142167
stacklevel=2,
143168
)
144169

@@ -163,7 +188,7 @@ def _check_or_update(name: str, timing: dict, update: bool,
163188
_IMSHOW_SIZES,
164189
ids=[f"{h}x{w}" for h, w, _ in _IMSHOW_SIZES],
165190
)
166-
def test_bench_imshow(h, w, is_slow, bench_page, update_benchmarks, run_slow):
191+
def test_bench_imshow(h, w, is_slow, bench_page, update_benchmarks, run_slow, ignore_hardware):
167192
"""Render-time benchmark: imshow with {h}×{w} image data."""
168193
if is_slow and not run_slow:
169194
pytest.skip(f"Skipping {h}×{w} in fast CI — pass --run-slow to include")
@@ -189,7 +214,8 @@ def test_bench_imshow(h, w, is_slow, bench_page, update_benchmarks, run_slow):
189214
timeout=timeout_ms,
190215
)
191216

192-
_check_or_update(f"js_imshow_{h}x{w}", timing, update_benchmarks)
217+
_check_or_update(f"js_imshow_{h}x{w}", timing, update_benchmarks,
218+
ignore_hardware=ignore_hardware)
193219

194220

195221
# ── 1D plot benchmarks ────────────────────────────────────────────────────────
@@ -198,7 +224,7 @@ def test_bench_imshow(h, w, is_slow, bench_page, update_benchmarks, run_slow):
198224

199225

200226
@pytest.mark.parametrize("n_pts", _PLOT1D_SIZES, ids=[str(n) for n in _PLOT1D_SIZES])
201-
def test_bench_plot1d(n_pts, bench_page, update_benchmarks):
227+
def test_bench_plot1d(n_pts, bench_page, update_benchmarks, ignore_hardware):
202228
"""Render-time benchmark: plot1d with {n_pts} points."""
203229
rng = np.random.default_rng(1)
204230
fig, ax = apl.subplots(1, 1, figsize=(640, 320))
@@ -215,7 +241,8 @@ def test_bench_plot1d(n_pts, bench_page, update_benchmarks):
215241
n_samples=15,
216242
)
217243

218-
_check_or_update(f"js_plot1d_{n_pts}pts", timing, update_benchmarks)
244+
_check_or_update(f"js_plot1d_{n_pts}pts", timing, update_benchmarks,
245+
ignore_hardware=ignore_hardware)
219246

220247

221248
# ── pcolormesh benchmarks ─────────────────────────────────────────────────────
@@ -224,7 +251,7 @@ def test_bench_plot1d(n_pts, bench_page, update_benchmarks):
224251

225252

226253
@pytest.mark.parametrize("n", _MESH_SIZES, ids=[f"{n}x{n}" for n in _MESH_SIZES])
227-
def test_bench_pcolormesh(n, bench_page, update_benchmarks):
254+
def test_bench_pcolormesh(n, bench_page, update_benchmarks, ignore_hardware):
228255
"""Render-time benchmark: pcolormesh with {n}×{n} grid."""
229256
rng = np.random.default_rng(2)
230257
xe = np.linspace(0.0, 1.0, n + 1)
@@ -245,12 +272,13 @@ def test_bench_pcolormesh(n, bench_page, update_benchmarks):
245272
n_samples=15,
246273
)
247274

248-
_check_or_update(f"js_pcolormesh_{n}x{n}", timing, update_benchmarks)
275+
_check_or_update(f"js_pcolormesh_{n}x{n}", timing, update_benchmarks,
276+
ignore_hardware=ignore_hardware)
249277

250278

251279
# ── 3D surface benchmark ──────────────────────────────────────────────────────
252280

253-
def test_bench_plot3d(bench_page, update_benchmarks):
281+
def test_bench_plot3d(bench_page, update_benchmarks, ignore_hardware):
254282
"""Render-time benchmark: 3D surface (rotation interaction path)."""
255283
x = np.linspace(-2.0, 2.0, 48)
256284
y = np.linspace(-2.0, 2.0, 48)
@@ -272,13 +300,14 @@ def test_bench_plot3d(bench_page, update_benchmarks):
272300
n_samples=15,
273301
)
274302

275-
_check_or_update("js_plot3d_48x48", timing, update_benchmarks)
303+
_check_or_update("js_plot3d_48x48", timing, update_benchmarks,
304+
ignore_hardware=ignore_hardware)
276305

277306

278307
# ── bar chart benchmark ───────────────────────────────────────────────────────
279308

280309
@pytest.mark.parametrize("n_bars", [10, 100], ids=["10bars", "100bars"])
281-
def test_bench_bar(n_bars, bench_page, update_benchmarks):
310+
def test_bench_bar(n_bars, bench_page, update_benchmarks, ignore_hardware):
282311
"""Render-time benchmark: bar chart with {n_bars} bars."""
283312
rng = np.random.default_rng(3)
284313
fig, ax = apl.subplots(1, 1, figsize=(640, 320))
@@ -295,12 +324,13 @@ def test_bench_bar(n_bars, bench_page, update_benchmarks):
295324
n_samples=15,
296325
)
297326

298-
_check_or_update(f"js_bar_{n_bars}bars", timing, update_benchmarks)
327+
_check_or_update(f"js_bar_{n_bars}bars", timing, update_benchmarks,
328+
ignore_hardware=ignore_hardware)
299329

300330

301331
# ── interaction: 2D pan ───────────────────────────────────────────────────────
302332

303-
def test_bench_interaction_2d_pan(bench_page, update_benchmarks):
333+
def test_bench_interaction_2d_pan(bench_page, update_benchmarks, ignore_hardware):
304334
"""Interaction benchmark: 2D pan drag (20 mousemove events on 512² image)."""
305335
rng = np.random.default_rng(4)
306336
fig, ax = apl.subplots(1, 1, figsize=(512 + _PAD_L + _PAD_R,
@@ -343,9 +373,9 @@ def test_bench_interaction_2d_pan(bench_page, update_benchmarks):
343373

344374
timing = page.evaluate(f"() => window._aplTiming && window._aplTiming['{panel_id}']")
345375
_check_or_update("js_interaction_2d_pan", timing, update_benchmarks,
346-
fail_ratio=2.5, warn_ratio=1.75)
376+
fail_ratio=2.5, warn_ratio=1.75, ignore_hardware=ignore_hardware)
347377

348-
def test_bench_interaction_2d_zoom(bench_page, update_benchmarks):
378+
def test_bench_interaction_2d_zoom(bench_page, update_benchmarks, ignore_hardware):
349379
"""Interaction benchmark: 2D wheel zoom (20 wheel events on 512² image)."""
350380
rng = np.random.default_rng(5)
351381
fig, ax = apl.subplots(1, 1, figsize=(512 + _PAD_L + _PAD_R,
@@ -376,9 +406,9 @@ def test_bench_interaction_2d_zoom(bench_page, update_benchmarks):
376406

377407
timing = page.evaluate(f"() => window._aplTiming && window._aplTiming['{panel_id}']")
378408
_check_or_update("js_interaction_2d_zoom", timing, update_benchmarks,
379-
fail_ratio=2.5, warn_ratio=1.75)
409+
fail_ratio=2.5, warn_ratio=1.75, ignore_hardware=ignore_hardware)
380410

381-
def test_bench_interaction_1d_pan(bench_page, update_benchmarks):
411+
def test_bench_interaction_1d_pan(bench_page, update_benchmarks, ignore_hardware):
382412
"""Interaction benchmark: 1D pan drag (20 mousemove events, 10K points)."""
383413
rng = np.random.default_rng(6)
384414
pw, ph = 640, 320
@@ -414,6 +444,4 @@ def test_bench_interaction_1d_pan(bench_page, update_benchmarks):
414444

415445
timing = page.evaluate(f"() => window._aplTiming && window._aplTiming['{panel_id}']")
416446
_check_or_update("js_interaction_1d_pan", timing, update_benchmarks,
417-
fail_ratio=2.5, warn_ratio=1.75)
418-
419-
447+
fail_ratio=2.5, warn_ratio=1.75, ignore_hardware=ignore_hardware)

0 commit comments

Comments
 (0)