diff --git a/CHANGELOG.md b/CHANGELOG.md index 0d9c1de..c1c7744 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,15 @@ per the process in [`docs/releasing.md`](docs/releasing.md). ### Added +- **Healthchecks.io dead-man's switch** (`healthchecks.*` in `config.json`, **default off**): + an optional external liveness monitor. When enabled, the dashboard loop pings a unique + Healthchecks.io URL every cycle; if the host dies (power loss, kernel panic, NIC death) the + pings stop and Healthchecks.io alerts you — the one failure mode an in-stack notifier can't + report from a dead machine. Optionally sends `/fail` while a required node is down + (`signal_fail_on_node_down`, on by default), supports self-hosted instances via `base_url`, + and fails silently when offline / Tor-only. Manual setup (paste the ping URL; the URL is + stored as a secret in the owner-only `.env`). See [`docs/monitoring.md`](docs/monitoring.md) + (#79). - Dashboard header shows the host's **IP address** next to the hostname when the configured `dashboard.host` is a name, as `hostname @ ip` (e.g. `pithead.local @ 192.168.1.42`), so you can still reach the dashboard when the hostname doesn't resolve from your phone or another machine on the LAN. The diff --git a/build/dashboard/mining_dashboard/config/config.py b/build/dashboard/mining_dashboard/config/config.py index 7aa4c3a..de31ef9 100644 --- a/build/dashboard/mining_dashboard/config/config.py +++ b/build/dashboard/mining_dashboard/config/config.py @@ -112,6 +112,38 @@ NODE_DOWN_AFTER_SEC = int(os.environ.get("NODE_DOWN_AFTER_SEC", 90)) NODE_RECOVERY_AFTER_SEC = int(os.environ.get("NODE_RECOVERY_AFTER_SEC", 60)) +# --- Healthchecks.io dead-man's switch (Issue #79) --- +# Optional external liveness monitor. 
When enabled, the dashboard loop pings a unique URL +# every cycle; if the whole host dies (power loss, kernel panic, NIC death) the dashboard +# dies with it, the pings stop, and Healthchecks.io alerts the operator on the *absence* of +# a ping — the one failure mode an in-stack notifier (#45) structurally can't report. +# Default OFF: with HEALTHCHECKS_ENABLED unset nothing ever pings and there are no errors. +HEALTHCHECKS_ENABLED = os.environ.get("HEALTHCHECKS_ENABLED", "false").strip().lower() == "true" + +# Manual mode (MVP): paste the full ping URL Healthchecks.io shows you, e.g. +# https://hc-ping.com/your-unique-uuid-here. A bare uuid/slug is also accepted and is joined onto +# HEALTHCHECKS_BASE_URL — which is what `base_url` is for: pointing at a *self-hosted* +# Healthchecks instance instead of the hosted hc-ping.com. No API key is stored (model B +# auto-provisioning is intentionally out of scope). +HEALTHCHECKS_PING_URL = os.environ.get("HEALTHCHECKS_PING_URL", "").strip() +HEALTHCHECKS_BASE_URL = os.environ.get("HEALTHCHECKS_BASE_URL", "https://hc-ping.com").strip() + +# How often to ping. The loop runs every UPDATE_INTERVAL, so this is a throttle floor — a +# value below UPDATE_INTERVAL just pings every cycle. Set your Healthchecks period + grace +# comfortably above this so a single missed cycle (e.g. a dashboard restart) doesn't alert. +try: + HEALTHCHECKS_INTERVAL_SEC = int(os.environ.get("HEALTHCHECKS_INTERVAL_SECONDS", "60")) +except ValueError: + HEALTHCHECKS_INTERVAL_SEC = 60 +if HEALTHCHECKS_INTERVAL_SEC < 0: + HEALTHCHECKS_INTERVAL_SEC = 0 + +# When true, send Healthchecks a `/fail` (instead of a success ping) while a *required* node +# is down — i.e. the same condition that rejects workers in #31 (monerod always; Tari only +# when TARI_REQUIRED). This makes the check health-aware: it goes red on a degraded-but-alive +# stack, not just a dead host. Set false for plain liveness (only host death trips it).
+HEALTHCHECKS_FAIL_ON_NODE_DOWN = os.environ.get("HEALTHCHECKS_FAIL_ON_NODE_DOWN", "true").strip().lower() == "true" + # --- Monero Configuration --- # Used to determine if the node is local (Docker) or remote MONERO_NODE_HOST = os.environ.get("MONERO_NODE_HOST", "172.28.0.26") diff --git a/build/dashboard/mining_dashboard/service/data_service.py b/build/dashboard/mining_dashboard/service/data_service.py index d69169b..6ac0c66 100644 --- a/build/dashboard/mining_dashboard/service/data_service.py +++ b/build/dashboard/mining_dashboard/service/data_service.py @@ -16,6 +16,7 @@ from mining_dashboard.collector.logs import get_monero_sync_status from mining_dashboard.collector.system import get_disk_usage, get_hugepages_status, get_memory_usage, get_load_average, get_cpu_usage from mining_dashboard.service.node_health import NodeHealthMonitor +from mining_dashboard.service.healthchecks import HealthchecksClient logger = logging.getLogger("DataService") @@ -206,6 +207,11 @@ def __init__(self, state_manager, proxy_client, xvb_client): self.docker_control = DockerControl() self.monero_health = NodeHealthMonitor() self.tari_health = NodeHealthMonitor() + + # Healthchecks.io dead-man's switch (Issue #79). Disabled by default — when off this is + # a no-op. When on, each cycle pings a unique URL; the alert fires externally on the + # *absence* of a ping, so it survives a host death the in-stack notifier can't report. + self.healthchecks = HealthchecksClient.from_config() # True while we've stopped the proxy to reject workers. Persisted in the snapshot so # a dashboard restart mid-outage still readmits workers once the node recovers. self.workers_rejected = False @@ -478,6 +484,16 @@ async def run(self): snapshot_data.pop("shares", None) await asyncio.to_thread(self.state_manager.save_snapshot, snapshot_data) + # 6b. Healthchecks.io dead-man's switch (Issue #79). 
Ping each cycle so the + # external monitor alerts on the *absence* of a ping if the host ever dies + # (power loss, crash, NIC death). Send /fail instead while a *required* node + # is down — the same predicate as #31's worker rejection: monerod always, + # Tari only when required. The client throttles and fails silently; gate on + # `enabled` so a disabled (default) stack never spawns the worker thread. + if self.healthchecks.enabled: + required_node_down = monero_down or (tari_down and TARI_REQUIRED) + await asyncio.to_thread(self.healthchecks.ping, fail=required_node_down) + # 7. External API Sync (Throttled to every 10th iteration) if iteration_count % 10 == 0: real_xvb_stats = await asyncio.to_thread(self.xvb_client.get_stats) diff --git a/build/dashboard/mining_dashboard/service/healthchecks.py b/build/dashboard/mining_dashboard/service/healthchecks.py new file mode 100644 index 0000000..352d76e --- /dev/null +++ b/build/dashboard/mining_dashboard/service/healthchecks.py @@ -0,0 +1,145 @@ +"""Healthchecks.io dead-man's-switch pinger (Issue #79). + +A thin, self-contained client the data loop calls once per cycle to ping a unique +Healthchecks.io URL. The value is in what *stops* happening: if the host dies, the +dashboard dies with it, the pings stop, and Healthchecks.io fires an alert on the absence +of a ping — evaluated on *their* servers, so it survives the very outage (power loss, kernel +panic, NIC death) an in-stack notifier can't report from a dead machine. + +Design notes: +- **Default off.** A disabled client is a no-op: :meth:`ping` returns immediately, opens no + socket, and logs nothing. +- **Fails silently.** A ping that can't reach the endpoint (offline, or a Tor-only host that + can't reach clearnet hc-ping.com) is logged at DEBUG only — never WARNING/ERROR — so the + log stays quiet, consistent with the stack's offline check discipline (#59). A blank + ``ping_url`` while *enabled* is a genuine misconfiguration and warns once. 
+- **Throttled.** :data:`interval` is a floor between pings; the loop calls every cycle but we + only hit the network once per interval. The throttle clock only advances on a *successful* + send, so while offline we keep retrying every cycle rather than backing off. + +Manual setup only (MVP): the operator pastes the ping URL from Healthchecks.io. Auto- +provisioning via the Management API (which would mean storing a powerful API key) is +intentionally left out — see ``docs/monitoring.md``. +""" +import logging +import time + +import requests + +from mining_dashboard.config.config import ( + HEALTHCHECKS_ENABLED, + HEALTHCHECKS_PING_URL, + HEALTHCHECKS_BASE_URL, + HEALTHCHECKS_INTERVAL_SEC, + HEALTHCHECKS_FAIL_ON_NODE_DOWN, +) + +logger = logging.getLogger("Healthchecks") + +# A ping is a tiny request; keep the timeout short so a hung endpoint can't stall the loop's +# worker thread for long. Healthchecks.io recommends GET/HEAD/POST to the ping URL. +_PING_TIMEOUT_SEC = 10 + + +def _resolve_ping_url(ping_url, base_url): + """Resolve the configured ping URL into a full success endpoint, or ``""`` if unset. + + Two accepted shapes, so the same config works for hosted and self-hosted instances: + + - A full ``http(s)://...`` URL (what Healthchecks.io shows you) is used as-is. This already + carries the host, so self-hosted is supported by pasting the self-hosted URL — no + ``base_url`` needed. + - A bare uuid/slug is joined onto ``base_url`` (default ``https://hc-ping.com``; override it + to point at a self-hosted instance, e.g. ``https://hc.example.com/ping``). + + Trailing slashes are normalised so the ``/fail`` and ``/start`` suffixes append cleanly. 
+ """ + ping_url = (ping_url or "").strip() + if not ping_url: + return "" + if ping_url.startswith(("http://", "https://")): + return ping_url.rstrip("/") + return (base_url or "").rstrip("/") + "/" + ping_url.lstrip("/") + + +class HealthchecksClient: + """Pings a Healthchecks.io check on a throttle; safe to call every loop cycle.""" + + def __init__(self, enabled, ping_url, base_url, interval_seconds, + fail_on_node_down, clock=time.monotonic): + self.enabled = bool(enabled) + self.url = _resolve_ping_url(ping_url, base_url) + self.interval = max(0, int(interval_seconds or 0)) + self.fail_on_node_down = bool(fail_on_node_down) + self._clock = clock + self._last_ping = None # monotonic time of the last *successful* send + self._warned_misconfig = False + + if self.enabled and self.url: + logger.info( + "Healthchecks.io dead-man's switch enabled (ping every %ss%s).", + self.interval, + ", /fail on required-node-down" if self.fail_on_node_down else "", + ) + + @classmethod + def from_config(cls): + """Build a client from the module-level config (env-backed) values.""" + return cls( + enabled=HEALTHCHECKS_ENABLED, + ping_url=HEALTHCHECKS_PING_URL, + base_url=HEALTHCHECKS_BASE_URL, + interval_seconds=HEALTHCHECKS_INTERVAL_SEC, + fail_on_node_down=HEALTHCHECKS_FAIL_ON_NODE_DOWN, + ) + + @property + def active(self): + """True only when enabled *and* a usable ping URL is configured.""" + return self.enabled and bool(self.url) + + def _due(self, now): + """Whether enough time has passed since the last successful ping to send another.""" + if self._last_ping is None: + return True + return (now - self._last_ping) >= self.interval + + def ping(self, fail=False): + """Send one heartbeat (or ``/fail``) if due. Never raises. + + ``fail`` signals a required node is down; it sends ``/fail`` only when + ``fail_on_node_down`` is on, otherwise a plain success ping (liveness only). 
+ + Returns ``True`` if the request completed (the HTTP status is not checked), else + ``False`` (disabled, misconfigured, throttled, or the request raised). + """ + if not self.enabled: + return False + if not self.url: + # Enabled but nothing to ping — surface the misconfig once, then stay quiet. + if not self._warned_misconfig: + logger.warning( + "Healthchecks enabled but no ping_url configured — not pinging. " + "Set healthchecks.ping_url in config.json." + ) + self._warned_misconfig = True + return False + + now = self._clock() + if not self._due(now): + return False + + endpoint = self.url + "/fail" if (fail and self.fail_on_node_down) else self.url + try: + requests.get(endpoint, timeout=_PING_TIMEOUT_SEC) + # Advance the throttle only on success so a transient outage keeps retrying. + self._last_ping = now + return True + except requests.RequestException as e: + # Offline / Tor-only / endpoint hiccup: survive silently — Healthchecks.io alerts on the + # missed ping. DEBUG only; log just the type — the message can embed the secret URL. 
+ logger.debug("Healthchecks ping failed (offline?): %s", type(e).__name__) + return False + except Exception as e: # pragma: no cover - defensive; never break the loop + logger.debug("Healthchecks unexpected error: %s", e) + return False diff --git a/build/dashboard/tests/service/test_data_service.py b/build/dashboard/tests/service/test_data_service.py index 304c681..7f3f901 100644 --- a/build/dashboard/tests/service/test_data_service.py +++ b/build/dashboard/tests/service/test_data_service.py @@ -565,6 +565,80 @@ async def test_run_nonblocking_tari_releases_and_stays_operational(self): assert svc.latest_data["global_sync"] is False assert svc.latest_data["tari_syncing_passive"] is True + async def _run_one_iteration(self, svc, monero_sync, tari_sync): + """Drive a single loop iteration with the given per-node sync signals.""" + worker_client = MagicMock() + worker_client.get_stats = AsyncMock(return_value={}) + tari_client = MagicMock() + tari_client.get_sync_status = AsyncMock(return_value=tari_sync) + tari_client.close = AsyncMock() + + with patch.object(ds_mod, "ClientSession", _FakeClientSession), \ + patch.object(ds_mod, "XMRigWorkerClient", return_value=worker_client), \ + patch.object(ds_mod, "TariClient", return_value=tari_client), \ + patch.object(ds_mod, "get_stratum_stats", return_value=({}, [])), \ + patch.object(ds_mod, "get_network_stats", return_value={"height": 100}), \ + patch.object(ds_mod, "get_tari_stats", return_value={"active": True, "status": "OK", "height": 3}), \ + patch.object(ds_mod, "get_p2pool_stats", return_value={"pool": {"last_share_time": 0, "difficulty": 0}}), \ + patch.object(ds_mod, "get_monero_sync_status", AsyncMock(return_value=monero_sync)), \ + patch.object(ds_mod, "get_disk_usage", return_value={}), \ + patch.object(ds_mod, "get_hugepages_status", return_value=("Enabled", "ok", "1/2")), \ + patch.object(ds_mod, "get_memory_usage", return_value={}), \ + patch.object(ds_mod, "get_load_average", return_value="0"), \ + patch.object(ds_mod, 
"get_cpu_usage", return_value="0%"), \ + patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)): + with pytest.raises(StopAsyncIteration): + await svc.run() + + async def test_healthchecks_pinged_when_healthy(self): + # Both nodes reachable & synced → a plain success ping (fail=False) each cycle. + svc, sm, proxy = _make_service() + proxy.get_workers.return_value = {"workers": []} + svc.healthchecks = MagicMock() + svc.healthchecks.enabled = True + svc.healthchecks.ping.return_value = True + + await self._run_one_iteration( + svc, + monero_sync={"is_syncing": False, "reachable": True, "percent": 100, "current": 100, "target": 100}, + tari_sync={"is_syncing": False, "reachable": True}, + ) + svc.healthchecks.ping.assert_called_once_with(fail=False) + + async def test_healthchecks_fail_when_required_node_down(self): + # A debounced-down required node (monerod) → /fail signal (fail=True). + svc, sm, proxy = _make_service() + proxy.get_workers.return_value = {"workers": []} + svc.healthchecks = MagicMock() + svc.healthchecks.enabled = True + svc.healthchecks.ping.return_value = True + # Force the debounced node-health verdict to DOWN for this cycle. + svc.monero_health = MagicMock() + svc.monero_health.update.return_value = True + svc.tari_health = MagicMock() + svc.tari_health.update.return_value = False + + await self._run_one_iteration( + svc, + monero_sync={"is_syncing": False, "reachable": False}, + tari_sync={"is_syncing": False, "reachable": True}, + ) + svc.healthchecks.ping.assert_called_once_with(fail=True) + + async def test_healthchecks_not_pinged_when_disabled(self): + # Default: the disabled client is never invoked from the loop (no worker thread). 
+ svc, sm, proxy = _make_service() + proxy.get_workers.return_value = {"workers": []} + svc.healthchecks = MagicMock() + svc.healthchecks.enabled = False + + await self._run_one_iteration( + svc, + monero_sync={"is_syncing": False, "reachable": True}, + tari_sync={"is_syncing": False, "reachable": True}, + ) + svc.healthchecks.ping.assert_not_called() + async def test_iteration_survives_collector_error(self): svc, sm, proxy = _make_service() worker_client = MagicMock() diff --git a/build/dashboard/tests/service/test_healthchecks.py b/build/dashboard/tests/service/test_healthchecks.py new file mode 100644 index 0000000..61eb89c --- /dev/null +++ b/build/dashboard/tests/service/test_healthchecks.py @@ -0,0 +1,173 @@ +"""Tests for the Healthchecks.io dead-man's-switch client (Issue #79).""" +import logging +from unittest.mock import patch + +import requests + +import mining_dashboard.service.healthchecks as hc_mod +from mining_dashboard.service.healthchecks import HealthchecksClient, _resolve_ping_url + + +class _Clock: + """A controllable monotonic clock for throttle tests.""" + + def __init__(self, t=1000.0): + self.t = t + + def __call__(self): + return self.t + + +def _client(clock=None, **overrides): + cfg = dict( + enabled=True, + ping_url="https://hc-ping.com/abc", + base_url="https://hc-ping.com", + interval_seconds=60, + fail_on_node_down=True, + ) + cfg.update(overrides) + return HealthchecksClient(clock=clock or _Clock(), **cfg) + + +class TestResolvePingUrl: + def test_full_https_url_used_as_is(self): + assert _resolve_ping_url("https://hc-ping.com/uuid", "https://hc-ping.com") == \ + "https://hc-ping.com/uuid" + + def test_full_http_url_used_as_is(self): + assert _resolve_ping_url("http://hc.local/ping/uuid", "https://hc-ping.com") == \ + "http://hc.local/ping/uuid" + + def test_trailing_slash_stripped(self): + assert _resolve_ping_url("https://hc-ping.com/uuid/", "https://hc-ping.com") == \ + "https://hc-ping.com/uuid" + + def 
test_bare_uuid_joined_with_default_base(self): + assert _resolve_ping_url("abc-123", "https://hc-ping.com") == "https://hc-ping.com/abc-123" + + def test_bare_uuid_joined_with_selfhosted_base(self): + # base_url override is how a self-hosted instance is supported with a bare uuid. + assert _resolve_ping_url("abc-123", "https://hc.example.com/ping/") == \ + "https://hc.example.com/ping/abc-123" + + def test_blank_and_none_resolve_to_empty(self): + assert _resolve_ping_url("", "https://hc-ping.com") == "" + assert _resolve_ping_url(" ", "https://hc-ping.com") == "" + assert _resolve_ping_url(None, "https://hc-ping.com") == "" + + +class TestActive: + def test_active_requires_enabled_and_url(self): + assert _client().active is True + assert _client(enabled=False).active is False + assert _client(ping_url="", base_url="https://hc-ping.com").active is False + + +class TestPingDisabledOrUnconfigured: + def test_disabled_is_a_noop(self): + c = _client(enabled=False) + with patch.object(hc_mod.requests, "get") as get: + assert c.ping() is False + get.assert_not_called() + + def test_enabled_without_url_warns_once(self, caplog): + c = _client(ping_url="", base_url="https://hc-ping.com") + with patch.object(hc_mod.requests, "get") as get, \ + caplog.at_level(logging.WARNING, logger="Healthchecks"): + assert c.ping() is False + assert c.ping() is False + get.assert_not_called() + warnings = [r for r in caplog.records if r.levelno == logging.WARNING] + assert len(warnings) == 1 # warned once, then stayed quiet + + +class TestPingSuccess: + def test_healthy_ping_hits_the_url(self): + c = _client() + with patch.object(hc_mod.requests, "get") as get: + assert c.ping() is True + get.assert_called_once() + assert get.call_args.args[0] == "https://hc-ping.com/abc" + + def test_fail_signal_appends_fail(self): + c = _client() + with patch.object(hc_mod.requests, "get") as get: + assert c.ping(fail=True) is True + assert get.call_args.args[0] == "https://hc-ping.com/abc/fail" + + def 
test_fail_signal_ignored_when_disabled_for_node_down(self): + # signal_fail_on_node_down off → a node-down still sends a *success* ping (liveness only). + c = _client(fail_on_node_down=False) + with patch.object(hc_mod.requests, "get") as get: + assert c.ping(fail=True) is True + assert get.call_args.args[0] == "https://hc-ping.com/abc" + + +class TestThrottle: + def test_second_immediate_ping_is_throttled(self): + clock = _Clock(1000.0) + c = _client(clock=clock, interval_seconds=60) + with patch.object(hc_mod.requests, "get") as get: + assert c.ping() is True # first ping goes out + assert c.ping() is False # within the interval → skipped + get.assert_called_once() + + def test_ping_again_after_interval_elapses(self): + clock = _Clock(1000.0) + c = _client(clock=clock, interval_seconds=60) + with patch.object(hc_mod.requests, "get") as get: + assert c.ping() is True + clock.t += 61 # interval elapsed + assert c.ping() is True + assert get.call_count == 2 + + def test_zero_interval_pings_every_call(self): + clock = _Clock(1000.0) + c = _client(clock=clock, interval_seconds=0) + with patch.object(hc_mod.requests, "get") as get: + assert c.ping() is True + assert c.ping() is True + assert get.call_count == 2 + + +class TestPingFailsSilently: + def test_network_error_is_swallowed_and_retries(self, caplog): + clock = _Clock(1000.0) + c = _client(clock=clock, interval_seconds=60) + with patch.object(hc_mod.requests, "get", + side_effect=requests.exceptions.ConnectionError("offline")) as get, \ + caplog.at_level(logging.DEBUG, logger="Healthchecks"): + assert c.ping() is False + # Throttle clock did NOT advance on failure → the very next call retries immediately. + assert c.ping() is False + assert get.call_count == 2 + # No noisy WARNING/ERROR — offline is expected and logged at debug only (#59 discipline). 
+ assert not [r for r in caplog.records if r.levelno >= logging.WARNING] + + def test_success_after_failure_advances_throttle(self): + clock = _Clock(1000.0) + c = _client(clock=clock, interval_seconds=60) + with patch.object(hc_mod.requests, "get", + side_effect=[requests.exceptions.ConnectionError("x"), None]): + assert c.ping() is False # failed, no throttle advance + assert c.ping() is True # retried immediately, succeeded → throttle set + assert c.ping() is False # now throttled + + +class TestFromConfig: + def test_defaults_to_disabled(self): + # Real config defaults: feature is off out of the box. + assert HealthchecksClient.from_config().enabled is False + + def test_reads_config_values(self): + with patch.object(hc_mod, "HEALTHCHECKS_ENABLED", True), \ + patch.object(hc_mod, "HEALTHCHECKS_PING_URL", "https://hc-ping.com/zzz"), \ + patch.object(hc_mod, "HEALTHCHECKS_BASE_URL", "https://hc-ping.com"), \ + patch.object(hc_mod, "HEALTHCHECKS_INTERVAL_SEC", 120), \ + patch.object(hc_mod, "HEALTHCHECKS_FAIL_ON_NODE_DOWN", False): + c = HealthchecksClient.from_config() + assert c.enabled is True + assert c.url == "https://hc-ping.com/zzz" + assert c.interval == 120 + assert c.fail_on_node_down is False diff --git a/config.advanced.example.json b/config.advanced.example.json index 0bd0026..de4d3e8 100644 --- a/config.advanced.example.json +++ b/config.advanced.example.json @@ -48,5 +48,13 @@ "timezone": "auto", "data_dir": "auto", "tari_required": true + }, + + "healthchecks": { + "enabled": false, + "ping_url": "", + "base_url": "https://hc-ping.com", + "interval_seconds": 60, + "signal_fail_on_node_down": true } } diff --git a/docker-compose.yml b/docker-compose.yml index 782fb42..7da1073 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -283,6 +283,16 @@ services: # miner waits for Tari's sync, and whether a Tari-only sync takes over the dashboard # (Issue #51). Set dashboard.tari_required:false to make Tari non-blocking. 
- TARI_REQUIRED=${TARI_REQUIRED:-true} + # Healthchecks.io dead-man's switch (#79). Optional, default off: when enabled the + # dashboard pings a unique URL each loop, so if the host dies the pings stop and + # Healthchecks.io alerts externally — the one failure mode an in-stack notifier can't + # report. Manual setup (paste the ping URL); see docs/monitoring.md. The dashboard runs + # network_mode: host, so it reaches hc-ping.com over the host's network. + - HEALTHCHECKS_ENABLED=${HEALTHCHECKS_ENABLED:-false} + - HEALTHCHECKS_PING_URL=${HEALTHCHECKS_PING_URL:-} + - HEALTHCHECKS_BASE_URL=${HEALTHCHECKS_BASE_URL:-https://hc-ping.com} + - HEALTHCHECKS_INTERVAL_SECONDS=${HEALTHCHECKS_INTERVAL_SECONDS:-60} + - HEALTHCHECKS_FAIL_ON_NODE_DOWN=${HEALTHCHECKS_FAIL_ON_NODE_DOWN:-true} # --- Docker Socket Proxy (read-only) --- # Read-only window onto the Docker API for the dashboard's container stats/logs. diff --git a/docs/README.md b/docs/README.md index b30dd61..c436441 100644 --- a/docs/README.md +++ b/docs/README.md @@ -14,6 +14,7 @@ deeper on individual topics once you're up and running. | [Hardware Requirements](hardware.md) | Minimum vs. recommended specs for the **stack host** — CPU, RAM, disk, network, OS — plus lighter-footprint options. (Miner hardware lives in [RigForge](https://github.com/p2pool-starter-stack/rigforge).) | | [Configuration](configuration.md) | Every `config.json` key and default, applying changes safely, **reusing an existing node via data directories**, and connecting to a **remote Monero node**. | | [The Dashboard](dashboard.md) | **Sync Mode**, the live operational view, and how to read every panel. | +| [Monitoring & Alerting](monitoring.md) | Optional **Healthchecks.io dead-man's switch** — get alerted when your host goes down (power loss, crash), even when it can't tell you itself. 
| | [Connecting Miners](workers.md) | Pointing any existing rig at the stack, plus [RigForge](https://github.com/p2pool-starter-stack/rigforge) for setting up new miners. | | [Architecture](architecture.md) | The nine services, how they fit together, the privacy model, and the algorithmic XvB switching engine. | | [Operations & Maintenance](operations.md) | The full `pithead` command reference, upgrades, backups, and troubleshooting. | @@ -26,4 +27,5 @@ deeper on individual topics once you're up and running. - **Will my machine handle it?** → [Hardware Requirements](hardware.md) - **Change a setting?** → [Configuration › Changing settings later](configuration.md#changing-settings-later) - **Already have a synced Monero node?** → [Configuration › Reusing an existing node](configuration.md#reusing-an-existing-node) +- **Want to be alerted if the host dies?** → [Monitoring & Alerting](monitoring.md) - **Something's not working?** → [Operations › Troubleshooting](operations.md#troubleshooting) diff --git a/docs/configuration.md b/docs/configuration.md index 535000a..8964bb9 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -89,6 +89,11 @@ plain HTTP, edit `config.json` and run `./pithead apply`. | `dashboard.timezone` | `auto` | Timezone for the dashboard's timestamps and charts. `auto` = **the host machine's timezone** (auto-detected, falling back to `Etc/UTC`); set an IANA name (e.g. `America/Chicago`) to override. | | `dashboard.data_dir` | `auto` | Where the dashboard's database lives. `auto` = `./data/dashboard`. | | `dashboard.tari_required` | `true` | How much a Tari problem holds up the rest of the stack. Monero is **required** to mine, so its behavior isn't configurable: a monerod outage always rejects workers (stops `xmrig-proxy` so miners **fail over to their backup pools**), and the miner is always held until monerod finishes syncing. Tari is **only needed for merge mining**, so this one flag decides how much it blocks. 
**`true` (default):** a Tari outage also rejects workers, the miner waits for Tari's initial sync too, and a Tari-only (re)sync shows the full-screen Sync view. **`false` (non-blocking):** keep mining Monero through a Tari outage, start mining as soon as Monero is synced (Tari finishes in the background), and keep the normal dashboard — with a `Tari syncing` indicator — instead of the takeover screen. | +| `healthchecks.enabled` | `false` | Turn on the optional [Healthchecks.io dead-man's switch](monitoring.md) — get alerted when your host stops responding (power loss, crash), even when it can't alert you itself. When off, nothing pings. | +| `healthchecks.ping_url` | _(blank)_ | The ping URL from Healthchecks.io (e.g. `https://hc-ping.com/your-unique-uuid-here`). Required when `enabled`. Treated as a secret — stored in the owner-only `.env`. See [Monitoring & Alerting](monitoring.md). | +| `healthchecks.base_url` | `https://hc-ping.com` | Only used when `ping_url` is a bare uuid rather than a full URL; override it to point at a **self-hosted** Healthchecks instance. | +| `healthchecks.interval_seconds` | `60` | Minimum seconds between pings (the loop pings at most this often). Keep your Healthchecks **period + grace** comfortably above it. | +| `healthchecks.signal_fail_on_node_down` | `true` | Send a `/fail` (turns the check red immediately) while a required node is down — same condition as worker rejection above. `false` = plain liveness (only a dead host trips it). | --- diff --git a/docs/monitoring.md b/docs/monitoring.md new file mode 100644 index 0000000..0b099b8 --- /dev/null +++ b/docs/monitoring.md @@ -0,0 +1,197 @@ +# Monitoring & alerting + +Pithead can ping an external **dead-man's switch** so you find out when your mining host +goes down — even when it can't tell you itself. + +## Why an *external* monitor? + +If the stack hits a problem while the machine is still alive — a node falls out of sync, a +container crashes — it can notice and react locally. 
But the failures that hurt most are the +ones that kill the whole host: **power loss, a kernel panic, a dead NIC, the box hanging**. A +dead machine can't send its own "I'm down" alert. + +[Healthchecks.io](https://healthchecks.io) solves this by inverting the logic. The stack +periodically pings a unique URL; **Healthchecks.io alerts you when the pings *stop***. Because +the alert is evaluated on Healthchecks.io's servers, it survives the very outage you want to +catch. It's a *dead-man's switch*: silence is the alarm. + +This is **off by default** and entirely optional — when disabled, nothing pings and nothing is +logged. + +--- + +## Setup (about 5 minutes) + +### 1. Create a check on Healthchecks.io + +1. Sign up at [healthchecks.io](https://healthchecks.io) — the **free tier** (20 checks, 3 + months of history) is plenty for one stack. (Prefer to self-host? See + [Self-hosting](#self-hosting-your-own-instance) below.) +2. Create a new check. Name it something like `pithead`. +3. Set its schedule: + - **Period** — how often Healthchecks.io *expects* a ping. Set this comfortably **above** + your ping interval (default 60s) so a single missed cycle — e.g. a quick dashboard + restart — doesn't trip a false alarm. **5 minutes** is a sensible starting point. + - **Grace** — how long after a missed period before you're alerted. **5–10 minutes** is + reasonable; shorter means faster alerts but more false positives on brief blips. +4. Copy the check's **ping URL** — it looks like `https://hc-ping.com/your-unique-uuid-here`. + +### 2. Choose where alerts go + +On the check's **Integrations** tab, point it at however you want to be notified — **email**, +**Telegram**, Slack, Discord, a webhook, and more. If you already use Telegram for other +alerts, you can route Healthchecks.io to the **same** Telegram chat, so host-down alerts and +in-stack events land in one place. + +### 3. 
Paste the ping URL into `config.json` + +Add a `healthchecks` block (see [`config.advanced.example.json`](../config.advanced.example.json)): + +```json +{ + "healthchecks": { + "enabled": true, + "ping_url": "https://hc-ping.com/your-unique-uuid-here" + } +} +``` + +`enabled` and `ping_url` are all you need; everything else has a sensible default. + +### 4. Apply + +```bash +./pithead apply +``` + +`apply` previews the change and recreates the dashboard container. The ping URL is treated as +a secret — it's stored in the owner-only `.env`, never echoed by `apply`, and never logged. + +That's it. Within a cycle or two the check on Healthchecks.io turns green. Kill the stack (or +the whole host) and, once the period + grace elapses, Healthchecks.io alerts you. + +--- + +## How it works + +- The dashboard's existing data-collection loop sends the ping each cycle, so it reuses the + process that's already running — no extra container or daemon. If the host dies, the + dashboard dies with it, the pings stop, and the alert fires. If only the dashboard container + restarts briefly, the **grace period** absorbs the gap. +- **Health-aware (optional).** With `signal_fail_on_node_down` on (the default), the stack + sends a `/fail` signal — turning the check red immediately — whenever a *required* node is + down: monerod always, and Tari only when `dashboard.tari_required` is `true` (the same + condition that fails miners over to their backup pools, see + [Configuration](configuration.md#configuration-reference)). So the check catches a + degraded-but-alive stack too, not just a dead host. Set it to `false` for plain liveness + (only a dead host trips the alert). +- **Fails silently.** A ping that can't get out — you're offline, or running + [Tor-only](architecture.md) without clearnet — is ignored quietly (it's logged at debug + level only). Healthchecks.io will alert on the missed ping regardless, which is the point. 
+ +--- + +## Configuration reference + +| Key | Default | Description | +|---|---|---| +| `healthchecks.enabled` | `false` | Master switch. When off, the stack never pings and logs nothing. | +| `healthchecks.ping_url` | _(blank)_ | The ping URL from Healthchecks.io, e.g. `https://hc-ping.com/<your-uuid>`. A bare uuid/slug is also accepted and is joined onto `base_url`. Treated as a secret (stored in the owner-only `.env`). | +| `healthchecks.base_url` | `https://hc-ping.com` | Only used when `ping_url` is a bare uuid (not a full URL). Override it to point at a [self-hosted](#self-hosting-your-own-instance) instance. | +| `healthchecks.interval_seconds` | `60` | Minimum seconds between pings. The loop runs every 30s, so a value below that just pings every cycle. Keep your Healthchecks **period + grace** well above this. | +| `healthchecks.signal_fail_on_node_down` | `true` | Send `/fail` (turning the check red immediately) while a required node is down. `false` = plain liveness only. | + +> Auto-provisioning the check via the Healthchecks.io Management API (so you wouldn't have to +> copy the URL by hand) was considered but deliberately left out: it would mean storing a +> powerful API key in your config. Manual setup keeps it simple, API-key-free, and works +> equally well with a self-hosted instance. + +--- + +## Self-hosting your own instance + +Healthchecks is open source and [self-hostable](https://healthchecks.io/docs/self_hosted/). To +point Pithead at your own instance, just paste its full ping URL into `ping_url` — it already +carries your host, so nothing else is needed: + +```json +{ + "healthchecks": { + "enabled": true, + "ping_url": "https://hc.example.com/ping/your-unique-uuid-here" + } +} +``` + +Alternatively, store the bare uuid in `ping_url` and set `base_url` to your instance's ping +prefix (e.g. `https://hc.example.com/ping`). 
+ +--- + +## Privacy note + +Pinging the hosted **hc-ping.com** happens over **clearnet**, which reveals your host's IP +address to Healthchecks.io — separate from the Monero/Tari traffic the stack routes over +[Tor](architecture.md). If that matters to you, **self-host** Healthchecks on infrastructure +you control (ideally reachable as an onion service or over a VPN). This feature is opt-in and +off by default precisely because it's a clearnet beacon. + +--- + +## Optional: a host-level ping, independent of the dashboard + +Pinging from the dashboard loop covers the big failure modes (host death, dashboard crash). If +you want a liveness signal that doesn't depend on the dashboard at all — handy on a dedicated +mining box — add a small **systemd timer** on the host that curls the same (or a second) ping +URL: + +```ini +# /etc/systemd/system/pithead-heartbeat.service +[Unit] +Description=Ping Healthchecks.io (host heartbeat) +[Service] +Type=oneshot +ExecStart=/usr/bin/curl -fsS -m 10 --retry 3 https://hc-ping.com/your-unique-uuid-here +``` + +```ini +# /etc/systemd/system/pithead-heartbeat.timer +[Unit] +Description=Run the Healthchecks.io heartbeat every minute +[Timer] +OnBootSec=1min +OnUnitActiveSec=1min +[Install] +WantedBy=timers.target +``` + +```bash +sudo systemctl enable --now pithead-heartbeat.timer +``` + +Use a **separate** check for the host timer if you want to tell "the host is up" apart from +"the mining stack is up." + +--- + +## Verifying & troubleshooting + +- **The check never goes green.** Confirm `enabled` is `true` and you ran `./pithead apply`. + Check the dashboard logs (`./pithead logs dashboard`) for a `Healthchecks.io dead-man's + switch enabled` line at startup; if you see `Healthchecks enabled but no ping_url + configured`, the URL is missing. Ping failures themselves are logged at debug level only. +- **Test it end to end.** Stop the stack (`./pithead stop`) and wait for the period + grace to + elapse — you should get the alert. 
Start it again and the check recovers. +- **Too many false alarms.** Increase the **period** and/or **grace** on Healthchecks.io, or + raise `interval_seconds` if you've set it very low. + +--- + +## See also + +- [Configuration](configuration.md) — the full `config.json` reference, including + `dashboard.tari_required`, which governs the `/fail` signal. +- [Architecture](architecture.md) — the privacy model and the Tor routing this feature sits + outside of. +- [Operations & Maintenance](operations.md) — the `pithead` command reference, logs, and + troubleshooting. diff --git a/pithead b/pithead index 59ad23b..0033728 100755 --- a/pithead +++ b/pithead @@ -1301,6 +1301,17 @@ render_env() { local tari_required tari_required=$(jq -r 'if .dashboard.tari_required != null then .dashboard.tari_required | tostring else "true" end' "$CONFIG_FILE") + # Healthchecks.io dead-man's switch (#79). Optional external liveness monitor; default off. + # Manual mode only — the operator pastes the ping URL Healthchecks.io shows them. The ping + # URL is a capability secret, so it lives in the owner-only .env (chmod 600 below) alongside + # the other secrets, never in a world-readable file. See docs/monitoring.md. + local hc_enabled hc_ping_url hc_base_url hc_interval hc_fail_on_down + hc_enabled=$(jq -r 'if .healthchecks.enabled != null then .healthchecks.enabled | tostring else "false" end' "$CONFIG_FILE") + hc_ping_url=$(jq -r '.healthchecks.ping_url // empty' "$CONFIG_FILE") + hc_base_url=$(jq -r '.healthchecks.base_url // "https://hc-ping.com"' "$CONFIG_FILE") + hc_interval=$(jq -r '.healthchecks.interval_seconds // 60' "$CONFIG_FILE") + hc_fail_on_down=$(jq -r 'if .healthchecks.signal_fail_on_node_down != null then .healthchecks.signal_fail_on_node_down | tostring else "true" end' "$CONFIG_FILE") + # Tari memory cap (#55). 
Tari officially needs only a few GB (min 4 GB host, 8 GB+ recommended), # but its memory grows unbounded over time — one 32 GB host was seen at ~11 GB while staying # healthy. Uncapped, that growth can OOM the whole host on small machines. So the cap is a SAFETY @@ -1365,6 +1376,11 @@ XVB_ENABLED=$xvb_enabled XVB_DONATION_LEVEL=$xvb_donation_level TARI_REQUIRED=$tari_required TARI_MEM_LIMIT=$tari_mem_limit +HEALTHCHECKS_ENABLED=$hc_enabled +HEALTHCHECKS_PING_URL=$hc_ping_url +HEALTHCHECKS_BASE_URL=$hc_base_url +HEALTHCHECKS_INTERVAL_SECONDS=$hc_interval +HEALTHCHECKS_FAIL_ON_NODE_DOWN=$hc_fail_on_down P2POOL_URL=172.28.0.28:3333 PROXY_API_PORT=3344 PROXY_AUTH_TOKEN=$PROXY_AUTH_TOKEN @@ -1678,6 +1694,13 @@ describe_change() { msg="Dashboard hostname: $old → $new." ;; MONERO_PREP_THREADS) msg="Monero block-prep threads: $old → $new." ;; + HEALTHCHECKS_ENABLED) + msg="Healthchecks.io dead-man's switch → $([ "$new" == "true" ] && echo ENABLED || echo disabled)." ;; + HEALTHCHECKS_PING_URL) + # The ping URL is a capability secret — report the change without printing it. + msg="Healthchecks.io ping URL updated." ;; + HEALTHCHECKS_BASE_URL|HEALTHCHECKS_INTERVAL_SECONDS|HEALTHCHECKS_FAIL_ON_NODE_DOWN) + msg="Healthchecks.io setting ($key): $old → $new." ;; *) msg="$key: $old → $new." 
;; esac diff --git a/tests/stack/run.sh b/tests/stack/run.sh index 88216ae..4b31cd8 100755 --- a/tests/stack/run.sh +++ b/tests/stack/run.sh @@ -108,6 +108,14 @@ assert_contains "wallet is DEST" "$(run_sourced "$SANDBOX" describe_change M assert_contains "xvb url is INFO" "$(run_sourced "$SANDBOX" describe_change XVB_POOL_URL a b)" "INFO" assert_contains "data_dir is DEST" "$(run_sourced "$SANDBOX" describe_change MONERO_DATA_DIR /a /b)" "DEST" assert_contains "tari mem is INFO" "$(run_sourced "$SANDBOX" describe_change TARI_MEM_LIMIT 2048m 4g)" "INFO" +# Healthchecks.io (#79): toggling is INFO, and the ping URL (a capability secret) must NOT be +# echoed in the apply preview — only the fact that it changed. +assert_contains "hc enable is INFO" "$(run_sourced "$SANDBOX" describe_change HEALTHCHECKS_ENABLED false true)" "INFO" +assert_contains "hc enable says ENABLED" "$(run_sourced "$SANDBOX" describe_change HEALTHCHECKS_ENABLED false true)" "ENABLED" +case "$(run_sourced "$SANDBOX" describe_change HEALTHCHECKS_PING_URL old https://hc-ping.com/SECRET)" in + *SECRET*) bad "hc ping_url not printed" "leaked the ping URL into the preview" ;; + *) ok "hc ping_url not printed" ;; +esac echo "== unit: env helpers ==" printf 'A=1\nB=two\nPROXY_AUTH_TOKEN=keep=me\n' > "$SANDBOX/old.env" @@ -299,6 +307,22 @@ printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","n out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" assert_eq "tari mem_limit explicit propagated" "$(run_sourced "$V" env_get_file "$V/.env" TARI_MEM_LIMIT)" "3072m" +# Healthchecks.io (#79): absent => disabled with the hosted base_url default. 
+seed_env +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"} }\n' "$WALLET" > "$V/config.json" +out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" +assert_eq "healthchecks default disabled" "$(run_sourced "$V" env_get_file "$V/.env" HEALTHCHECKS_ENABLED)" "false" +assert_eq "healthchecks default base_url" "$(run_sourced "$V" env_get_file "$V/.env" HEALTHCHECKS_BASE_URL)" "https://hc-ping.com" + +# An enabled config propagates the ping URL + tuning knobs verbatim to .env. +seed_env +printf '{ "monero": {"mode":"local","wallet_address":"%s","node_username":"u","node_password":"p"}, "tari":{"wallet_address":"T"}, "p2pool":{"pool":"mini"}, "dashboard":{"secure":false,"host":"box.lan"}, "healthchecks":{"enabled":true,"ping_url":"https://hc-ping.com/abc","interval_seconds":120,"signal_fail_on_node_down":false} }\n' "$WALLET" > "$V/config.json" +out="$(cd "$V" && DOCKER_LOG="$DOCKER_LOG" PATH="$V/bin:$PATH" ./pithead apply -y 2>&1)" +assert_eq "healthchecks enabled propagated" "$(run_sourced "$V" env_get_file "$V/.env" HEALTHCHECKS_ENABLED)" "true" +assert_eq "healthchecks ping_url propagated" "$(run_sourced "$V" env_get_file "$V/.env" HEALTHCHECKS_PING_URL)" "https://hc-ping.com/abc" +assert_eq "healthchecks interval propagated" "$(run_sourced "$V" env_get_file "$V/.env" HEALTHCHECKS_INTERVAL_SECONDS)" "120" +assert_eq "healthchecks fail-on-down propagated" "$(run_sourced "$V" env_get_file "$V/.env" HEALTHCHECKS_FAIL_ON_NODE_DOWN)" "false" + echo "== black-box: local node creds auto-generated + persisted (#50) ==" # A local node with BLANK creds: apply must generate them, write them into .env AND back into # config.json, and keep them stable on a second apply (don't regenerate every run).