p2pool-starter-stack · VijitSingh97 · Jun 4, 2026 · Jun 4, 2026
@@ -13,6 +13,15 @@ per the process in [`docs/releasing.md`](docs/releasing.md).
 
 ### Added
 
+- **Healthchecks.io dead-man's switch** (`healthchecks.*` in `config.json`, **default off**):
+  an optional external liveness monitor. When enabled, the dashboard loop pings a unique
+  Healthchecks.io URL every cycle; if the host dies (power loss, kernel panic, NIC death) the
+  pings stop and Healthchecks.io alerts you — the one failure mode an in-stack notifier can't
+  report from a dead machine. Optionally sends `/fail` while a required node is down
+  (`signal_fail_on_node_down`, on by default), supports self-hosted instances via `base_url`,
+  and fails silently when offline / Tor-only. Setup is manual: paste the ping URL, which is
+  stored as a secret in the owner-only `.env`. See [`docs/monitoring.md`](docs/monitoring.md)
+  (#79).
 - Dashboard header shows the host's **IP address** next to the hostname when the configured
   `dashboard.host` is a name, as `hostname @ ip` (e.g. `pithead.local @ 192.168.1.42`), so you can still reach the
   dashboard when the hostname doesn't resolve from your phone or another machine on the LAN. The

@@ -112,6 +112,38 @@
 NODE_DOWN_AFTER_SEC = int(os.environ.get("NODE_DOWN_AFTER_SEC", 90))
 NODE_RECOVERY_AFTER_SEC = int(os.environ.get("NODE_RECOVERY_AFTER_SEC", 60))
 
+# --- Healthchecks.io dead-man's switch (Issue #79) ---
+# Optional external liveness monitor. When enabled, the dashboard loop pings a unique URL
+# every cycle; if the whole host dies (power loss, kernel panic, NIC death) the dashboard
+# dies with it, the pings stop, and Healthchecks.io alerts the operator on the *absence* of
+# a ping — the one failure mode an in-stack notifier (#45) structurally can't report.
+# Default OFF: with HEALTHCHECKS_ENABLED unset nothing ever pings and there are no errors.
+HEALTHCHECKS_ENABLED = os.environ.get("HEALTHCHECKS_ENABLED", "false").strip().lower() == "true"
+
+# Manual mode (MVP): paste the full ping URL Healthchecks.io shows you, e.g.
+# https://hc-ping.com/<uuid>. A bare uuid/slug is also accepted and is joined onto
+# HEALTHCHECKS_BASE_URL, which exists to point at a *self-hosted* Healthchecks instance. A
+# blank value (e.g. `HEALTHCHECKS_BASE_URL=` in .env) falls back to the hosted hc-ping.com
+# rather than yielding a scheme-less URL. No API key is stored (auto-provisioning is out of scope).
+HEALTHCHECKS_PING_URL = os.environ.get("HEALTHCHECKS_PING_URL", "").strip()
+HEALTHCHECKS_BASE_URL = os.environ.get("HEALTHCHECKS_BASE_URL", "").strip() or "https://hc-ping.com"
+
+# How often to ping. The loop runs every UPDATE_INTERVAL, so this is a throttle floor — a
+# value below UPDATE_INTERVAL just pings every cycle. Set your Healthchecks period + grace
+# comfortably above this so a single missed cycle (e.g. a dashboard restart) doesn't alert.
+try:
+    HEALTHCHECKS_INTERVAL_SEC = int(os.environ.get("HEALTHCHECKS_INTERVAL_SECONDS", "60"))
+except ValueError:
+    HEALTHCHECKS_INTERVAL_SEC = 60  # unparsable (or blank) value: use the default
+if HEALTHCHECKS_INTERVAL_SEC < 0:
+    HEALTHCHECKS_INTERVAL_SEC = 0  # negative makes no sense for a throttle; clamp to "every cycle"
+
+# When true, send Healthchecks a `/fail` (instead of a success ping) while a *required* node
+# is down — i.e. the same condition that rejects workers in #31 (monerod always; Tari only
+# when TARI_REQUIRED). This makes the check health-aware: it goes red on a degraded-but-alive
+# stack, not just a dead host. Set false for plain liveness (only host death trips it).
+HEALTHCHECKS_FAIL_ON_NODE_DOWN = os.environ.get("HEALTHCHECKS_FAIL_ON_NODE_DOWN", "true").strip().lower() == "true"
+
 # --- Monero Configuration ---
 # Used to determine if the node is local (Docker) or remote
 MONERO_NODE_HOST = os.environ.get("MONERO_NODE_HOST", "172.28.0.26")

@@ -16,6 +16,7 @@
 from mining_dashboard.collector.logs import get_monero_sync_status
 from mining_dashboard.collector.system import get_disk_usage, get_hugepages_status, get_memory_usage, get_load_average, get_cpu_usage
 from mining_dashboard.service.node_health import NodeHealthMonitor
+from mining_dashboard.service.healthchecks import HealthchecksClient
 
 logger = logging.getLogger("DataService")
 
@@ -206,6 +207,11 @@ def __init__(self, state_manager, proxy_client, xvb_client):
         self.docker_control = DockerControl()
         self.monero_health = NodeHealthMonitor()
         self.tari_health = NodeHealthMonitor()
+
+        # Healthchecks.io dead-man's switch (Issue #79). Disabled by default — when off this is
+        # a no-op. When on, each cycle pings a unique URL; the alert fires externally on the
+        # *absence* of a ping, so it survives a host death the in-stack notifier can't report.
+        self.healthchecks = HealthchecksClient.from_config()
         # True while we've stopped the proxy to reject workers. Persisted in the snapshot so
         # a dashboard restart mid-outage still readmits workers once the node recovers.
         self.workers_rejected = False
@@ -478,6 +484,16 @@ async def run(self):
                     snapshot_data.pop("shares", None)
                     await asyncio.to_thread(self.state_manager.save_snapshot, snapshot_data)
 
+                    # 6b. Healthchecks.io dead-man's switch (Issue #79). Ping each cycle so the
+                    # external monitor alerts on the *absence* of a ping if the host ever dies
+                    # (power loss, crash, NIC death). Send /fail instead while a *required* node
+                    # is down — the same predicate as #31's worker rejection: monerod always,
+                    # Tari only when required. The client throttles and fails silently; gate on
+                    # `enabled` so a disabled (default) stack never spawns the worker thread.
+                    if self.healthchecks.enabled:
+                        required_node_down = monero_down or (tari_down and TARI_REQUIRED)
+                        await asyncio.to_thread(self.healthchecks.ping, fail=required_node_down)
+
                     # 7. External API Sync (Throttled to every 10th iteration)
                     if iteration_count % 10 == 0:
                         real_xvb_stats = await asyncio.to_thread(self.xvb_client.get_stats)

@@ -0,0 +1,145 @@
+"""Healthchecks.io dead-man's-switch pinger (Issue #79).
+
+A thin, self-contained client the data loop calls once per cycle to ping a unique
+Healthchecks.io URL. The value is in what *stops* happening: if the host dies, the
+dashboard dies with it, the pings stop, and Healthchecks.io fires an alert on the absence
+of a ping — evaluated on *their* servers, so it survives the very outage (power loss, kernel
+panic, NIC death) an in-stack notifier can't report from a dead machine.
+
+Design notes:
+- **Default off.** A disabled client is a no-op: :meth:`ping` returns immediately, opens no
+  socket, and logs nothing.
+- **Fails silently.** A ping that can't reach the endpoint (offline, or a Tor-only host that
+  can't reach clearnet hc-ping.com) is logged at DEBUG only — never WARNING/ERROR — so the
+  log stays quiet, consistent with the stack's offline check discipline (#59). A blank
+  ``ping_url`` while *enabled* is a genuine misconfiguration and warns once.
+- **Throttled.** :data:`interval` is a floor between pings; the loop calls every cycle but we
+  only hit the network once per interval. The throttle clock only advances on a *successful*
+  send, so while offline we keep retrying every cycle rather than backing off.
+
+Manual setup only (MVP): the operator pastes the ping URL from Healthchecks.io. Auto-
+provisioning via the Management API (which would mean storing a powerful API key) is
+intentionally left out — see ``docs/monitoring.md``.
+"""
+import logging
+import time
+
+import requests
+
+from mining_dashboard.config.config import (
+    HEALTHCHECKS_ENABLED,
+    HEALTHCHECKS_PING_URL,
+    HEALTHCHECKS_BASE_URL,
+    HEALTHCHECKS_INTERVAL_SEC,
+    HEALTHCHECKS_FAIL_ON_NODE_DOWN,
+)
+
+logger = logging.getLogger("Healthchecks")
+
+# Per-request timeout (seconds) for a ping. A ping is a tiny request; keep this short so a hung
+# endpoint can't stall the loop's worker thread for long. Healthchecks.io recommends GET/HEAD/POST.
+_PING_TIMEOUT_SEC = 10
+
+
+def _resolve_ping_url(ping_url, base_url):
+    """Resolve the configured ping URL into a full success endpoint, or ``""`` if unset.
+
+    Two accepted shapes, so the same config works for hosted and self-hosted instances:
+
+    - A full ``http(s)://...`` URL (scheme matched case-insensitively) is used as-is; it
+      already carries the host, so self-hosted works without ``base_url``.
+    - A bare uuid/slug is joined onto ``base_url`` (self-hosted: ``https://hc.example.com/ping``);
+      a blank ``base_url`` falls back to the hosted ``https://hc-ping.com``, never ``/<slug>``.
+
+    Trailing slashes are normalised so the ``/fail`` and ``/start`` suffixes append cleanly.
+    """
+    ping_url = (ping_url or "").strip()
+    if not ping_url:
+        return ""
+    if ping_url.lower().startswith(("http://", "https://")):
+        return ping_url.rstrip("/")
+    base = (base_url or "").strip().rstrip("/") or "https://hc-ping.com"
+    return base + "/" + ping_url.lstrip("/")
+
+
+class HealthchecksClient:
+    """Pings a Healthchecks.io check on a throttle; safe to call every loop cycle."""
+
+    def __init__(self, enabled, ping_url, base_url, interval_seconds,
+                 fail_on_node_down, clock=time.monotonic):
+        """Store the settings; ``clock`` is the time source for the throttle (injectable)."""
+        self.enabled = bool(enabled)
+        self.url = _resolve_ping_url(ping_url, base_url)
+        self.interval = max(0, int(interval_seconds or 0))
+        self.fail_on_node_down = bool(fail_on_node_down)
+        self._clock = clock
+        self._last_ping = None       # clock reading at the last *accepted* ping
+        self._warned_misconfig = False
+
+        if self.enabled and self.url:
+            logger.info(
+                "Healthchecks.io dead-man's switch enabled (ping every %ss%s).",
+                self.interval,
+                ", /fail on required-node-down" if self.fail_on_node_down else "",
+            )
+
+    @classmethod
+    def from_config(cls):
+        """Build a client from the module-level config (env-backed) values."""
+        return cls(
+            enabled=HEALTHCHECKS_ENABLED,
+            ping_url=HEALTHCHECKS_PING_URL,
+            base_url=HEALTHCHECKS_BASE_URL,
+            interval_seconds=HEALTHCHECKS_INTERVAL_SEC,
+            fail_on_node_down=HEALTHCHECKS_FAIL_ON_NODE_DOWN,
+        )
+
+    @property
+    def active(self):
+        """True only when enabled *and* a usable ping URL is configured."""
+        return self.enabled and bool(self.url)
+
+    def _due(self, now):
+        """Whether a ping is due: none accepted yet, or ``interval`` has elapsed since the last."""
+        return self._last_ping is None or (now - self._last_ping) >= self.interval
+
+    def ping(self, fail=False):
+        """Send one heartbeat (or ``/fail``) if due. Never raises.
+
+        ``fail`` signals a required node is down; it sends ``/fail`` only when
+        ``fail_on_node_down`` is on, otherwise a plain success ping (liveness only).
+
+        Returns ``True`` if a request was sent and answered with a non-error HTTP status,
+        else ``False`` (disabled, misconfigured, throttled, the request failed, or the
+        server rejected it with a 4xx/5xx).
+        """
+        if not self.enabled:
+            return False
+        if not self.url:
+            # Enabled but nothing to ping — surface the misconfig once, then stay quiet.
+            if not self._warned_misconfig:
+                logger.warning(
+                    "Healthchecks enabled but no ping_url configured — not pinging. "
+                    "Set healthchecks.ping_url in config.json."
+                )
+                self._warned_misconfig = True
+            return False
+
+        now = self._clock()
+        if not self._due(now):
+            return False
+
+        endpoint = self.url + "/fail" if (fail and self.fail_on_node_down) else self.url
+        try:
+            # A 4xx/5xx reply (wrong uuid, server error) is a failed ping, not a delivered one.
+            requests.get(endpoint, timeout=_PING_TIMEOUT_SEC).raise_for_status()
+            self._last_ping = now  # advance the throttle only on success, so outages keep retrying
+            return True
+        except requests.RequestException as e:
+            # Offline / Tor-only / HTTP error status: survive these silently — Healthchecks.io
+            # will alert on the missed ping. DEBUG, never noise.
+            logger.debug("Healthchecks ping failed (offline?): %s", e)
+            return False
+        except Exception as e:  # pragma: no cover - defensive; never break the loop
+            logger.debug("Healthchecks unexpected error: %s", e)
+            return False
@@ -565,6 +565,80 @@ async def test_run_nonblocking_tari_releases_and_stays_operational(self):
         assert svc.latest_data["global_sync"] is False
         assert svc.latest_data["tari_syncing_passive"] is True
 
+    async def _run_one_iteration(self, svc, monero_sync, tari_sync):
+        """Run ``svc.run()`` for one cycle with every collector patched out.
+
+        ``monero_sync`` / ``tari_sync`` are the sync dicts the patched node probes return.
+        """
+        xmrig_stub = MagicMock(get_stats=AsyncMock(return_value={}))
+        tari_stub = MagicMock(get_sync_status=AsyncMock(return_value=tari_sync), close=AsyncMock())
+
+        with patch.object(ds_mod, "ClientSession", _FakeClientSession), \
+             patch.object(ds_mod, "XMRigWorkerClient", return_value=xmrig_stub), \
+             patch.object(ds_mod, "TariClient", return_value=tari_stub), \
+             patch.object(ds_mod, "get_stratum_stats", return_value=({}, [])), \
+             patch.object(ds_mod, "get_network_stats", return_value={"height": 100}), \
+             patch.object(ds_mod, "get_tari_stats", return_value={"active": True, "status": "OK", "height": 3}), \
+             patch.object(ds_mod, "get_p2pool_stats", return_value={"pool": {"last_share_time": 0, "difficulty": 0}}), \
+             patch.object(ds_mod, "get_monero_sync_status", AsyncMock(return_value=monero_sync)), \
+             patch.object(ds_mod, "get_disk_usage", return_value={}), \
+             patch.object(ds_mod, "get_hugepages_status", return_value=("Enabled", "ok", "1/2")), \
+             patch.object(ds_mod, "get_memory_usage", return_value={}), \
+             patch.object(ds_mod, "get_load_average", return_value="0"), \
+             patch.object(ds_mod, "get_cpu_usage", return_value="0%"), \
+             patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)):  # sleeping aborts run()
+            with pytest.raises(StopAsyncIteration):
+                await svc.run()
+
+    async def test_healthchecks_pinged_when_healthy(self):
+        # Healthy stack (both nodes reachable and synced) → exactly one ping, with fail=False.
+        service, _state, proxy_client = _make_service()
+        proxy_client.get_workers.return_value = {"workers": []}
+        pinger = MagicMock()
+        pinger.enabled = True
+        pinger.ping.return_value = True
+        service.healthchecks = pinger
+        healthy_monero = {"is_syncing": False, "reachable": True, "percent": 100, "current": 100, "target": 100}
+        healthy_tari = {"is_syncing": False, "reachable": True}
+
+        await self._run_one_iteration(service, monero_sync=healthy_monero, tari_sync=healthy_tari)
+
+        pinger.ping.assert_called_once_with(fail=False)
+
+    async def test_healthchecks_fail_when_required_node_down(self):
+        # monerod (a required node) is debounced-down → the loop must ping with fail=True.
+        service, _state, proxy_client = _make_service()
+        proxy_client.get_workers.return_value = {"workers": []}
+        pinger = MagicMock()
+        pinger.enabled = True
+        pinger.ping.return_value = True
+        service.healthchecks = pinger
+        # Stub both node-health monitors so the verdict is fixed: monerod DOWN, Tari not down.
+        monero_monitor, tari_monitor = MagicMock(), MagicMock()
+        monero_monitor.update.return_value = True
+        tari_monitor.update.return_value = False
+        service.monero_health, service.tari_health = monero_monitor, tari_monitor
+
+        unreachable_monero = {"is_syncing": False, "reachable": False}
+        reachable_tari = {"is_syncing": False, "reachable": True}
+        await self._run_one_iteration(service, monero_sync=unreachable_monero, tari_sync=reachable_tari)
+
+        pinger.ping.assert_called_once_with(fail=True)
+
+    async def test_healthchecks_not_pinged_when_disabled(self):
+        # With the client disabled (the default), the loop must never call ping() at all.
+        service, _state, proxy_client = _make_service()
+        proxy_client.get_workers.return_value = {"workers": []}
+        pinger = MagicMock()
+        pinger.enabled = False
+        service.healthchecks = pinger
+        monero_ok = {"is_syncing": False, "reachable": True}
+        tari_ok = {"is_syncing": False, "reachable": True}
+
+        await self._run_one_iteration(service, monero_sync=monero_ok, tari_sync=tari_ok)
+
+        pinger.ping.assert_not_called()
+
     async def test_iteration_survives_collector_error(self):
         svc, sm, proxy = _make_service()
         worker_client = MagicMock()