Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,15 @@ per the process in [`docs/releasing.md`](docs/releasing.md).

### Added

- **Healthchecks.io dead-man's switch** (`healthchecks.*` in `config.json`, **default off**):
an optional external liveness monitor. When enabled, the dashboard loop pings a unique
Healthchecks.io URL every cycle; if the host dies (power loss, kernel panic, NIC death) the
pings stop and Healthchecks.io alerts you — the one failure mode an in-stack notifier can't
report from a dead machine. Optionally sends `/fail` while a required node is down
(`signal_fail_on_node_down`, on by default), supports self-hosted instances via `base_url`,
and fails silently when offline / Tor-only. Manual setup (paste the ping URL; the URL is
stored as a secret in the owner-only `.env`). See [`docs/monitoring.md`](docs/monitoring.md)
(#79).
- Dashboard header shows the host's **IP address** next to the hostname when the configured
`dashboard.host` is a name, as `hostname @ ip` (e.g. `pithead.local @ 192.168.1.42`), so you can still reach the
dashboard when the hostname doesn't resolve from your phone or another machine on the LAN. The
Expand Down
32 changes: 32 additions & 0 deletions build/dashboard/mining_dashboard/config/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,38 @@
NODE_DOWN_AFTER_SEC = int(os.environ.get("NODE_DOWN_AFTER_SEC", 90))
NODE_RECOVERY_AFTER_SEC = int(os.environ.get("NODE_RECOVERY_AFTER_SEC", 60))

# --- Healthchecks.io dead-man's switch (Issue #79) ---
# Optional external liveness monitor. When enabled, the dashboard loop pings a unique URL
# every cycle; if the whole host dies (power loss, kernel panic, NIC death) the dashboard
# dies with it, the pings stop, and Healthchecks.io alerts the operator on the *absence* of
# a ping — the one failure mode an in-stack notifier (#45) structurally can't report.
# Default OFF: with HEALTHCHECKS_ENABLED unset nothing ever pings and there are no errors.
# Master switch. Only the literal string "true" (case-insensitive, surrounding whitespace
# ignored) turns the feature on; anything else, including unset, leaves it off.
HEALTHCHECKS_ENABLED = os.getenv("HEALTHCHECKS_ENABLED", "false").strip().lower() == "true"

# Manual mode (MVP). HEALTHCHECKS_PING_URL is either the full ping URL shown by
# Healthchecks.io (e.g. https://hc-ping.com/<uuid>) or a bare uuid/slug. A bare value is
# joined onto HEALTHCHECKS_BASE_URL, which exists so a *self-hosted* Healthchecks instance
# can be targeted instead of the hosted hc-ping.com. No API key is stored (model B
# auto-provisioning is intentionally out of scope).
HEALTHCHECKS_PING_URL = os.getenv("HEALTHCHECKS_PING_URL", "").strip()
HEALTHCHECKS_BASE_URL = os.getenv("HEALTHCHECKS_BASE_URL", "https://hc-ping.com").strip()

# Minimum number of seconds between pings. Note the env var is HEALTHCHECKS_INTERVAL_SECONDS
# while the constant is ..._SEC. The loop runs every UPDATE_INTERVAL, so this is a throttle
# floor: a value below UPDATE_INTERVAL simply pings every cycle. Keep the Healthchecks
# period + grace comfortably above it so one missed cycle (e.g. a dashboard restart) does
# not alert. Unparseable values fall back to 60; negative values are clamped to 0.
try:
    HEALTHCHECKS_INTERVAL_SEC = max(0, int(os.getenv("HEALTHCHECKS_INTERVAL_SECONDS", "60")))
except ValueError:
    HEALTHCHECKS_INTERVAL_SEC = 60

# When true, a `/fail` is sent (instead of a success ping) while a *required* node is down —
# the same condition that rejects workers in #31 (monerod always; Tari only when
# TARI_REQUIRED). That makes the check health-aware: it goes red on a degraded-but-alive
# stack, not just a dead host. Set to false for plain liveness (only host death trips it).
HEALTHCHECKS_FAIL_ON_NODE_DOWN = (
    os.getenv("HEALTHCHECKS_FAIL_ON_NODE_DOWN", "true").strip().lower() == "true"
)

# --- Monero Configuration ---
# Used to determine if the node is local (Docker) or remote
MONERO_NODE_HOST = os.environ.get("MONERO_NODE_HOST", "172.28.0.26")
Expand Down
16 changes: 16 additions & 0 deletions build/dashboard/mining_dashboard/service/data_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
from mining_dashboard.collector.logs import get_monero_sync_status
from mining_dashboard.collector.system import get_disk_usage, get_hugepages_status, get_memory_usage, get_load_average, get_cpu_usage
from mining_dashboard.service.node_health import NodeHealthMonitor
from mining_dashboard.service.healthchecks import HealthchecksClient

logger = logging.getLogger("DataService")

Expand Down Expand Up @@ -206,6 +207,11 @@ def __init__(self, state_manager, proxy_client, xvb_client):
self.docker_control = DockerControl()
self.monero_health = NodeHealthMonitor()
self.tari_health = NodeHealthMonitor()

# Healthchecks.io dead-man's switch (Issue #79). Disabled by default — when off this is
# a no-op. When on, each cycle pings a unique URL; the alert fires externally on the
# *absence* of a ping, so it survives a host death the in-stack notifier can't report.
self.healthchecks = HealthchecksClient.from_config()
# True while we've stopped the proxy to reject workers. Persisted in the snapshot so
# a dashboard restart mid-outage still readmits workers once the node recovers.
self.workers_rejected = False
Expand Down Expand Up @@ -478,6 +484,16 @@ async def run(self):
snapshot_data.pop("shares", None)
await asyncio.to_thread(self.state_manager.save_snapshot, snapshot_data)

# 6b. Healthchecks.io dead-man's switch (Issue #79). Ping each cycle so the
# external monitor alerts on the *absence* of a ping if the host ever dies
# (power loss, crash, NIC death). Send /fail instead while a *required* node
# is down — the same predicate as #31's worker rejection: monerod always,
# Tari only when required. The client throttles and fails silently; gate on
# `enabled` so a disabled (default) stack never spawns the worker thread.
if self.healthchecks.enabled:
required_node_down = monero_down or (tari_down and TARI_REQUIRED)
await asyncio.to_thread(self.healthchecks.ping, fail=required_node_down)

# 7. External API Sync (Throttled to every 10th iteration)
if iteration_count % 10 == 0:
real_xvb_stats = await asyncio.to_thread(self.xvb_client.get_stats)
Expand Down
145 changes: 145 additions & 0 deletions build/dashboard/mining_dashboard/service/healthchecks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
"""Healthchecks.io dead-man's-switch pinger (Issue #79).

A thin, self-contained client the data loop calls once per cycle to ping a unique
Healthchecks.io URL. The value is in what *stops* happening: if the host dies, the
dashboard dies with it, the pings stop, and Healthchecks.io fires an alert on the absence
of a ping — evaluated on *their* servers, so it survives the very outage (power loss, kernel
panic, NIC death) an in-stack notifier can't report from a dead machine.

Design notes:
- **Default off.** A disabled client is a no-op: :meth:`ping` returns immediately, opens no
socket, and logs nothing.
- **Fails silently.** A ping that can't reach the endpoint (offline, or a Tor-only host that
can't reach clearnet hc-ping.com) is logged at DEBUG only — never WARNING/ERROR — so the
log stays quiet, consistent with the stack's offline check discipline (#59). A blank
``ping_url`` while *enabled* is a genuine misconfiguration and warns once.
- **Throttled.** :data:`interval` is a floor between pings; the loop calls every cycle but we
only hit the network once per interval. The throttle clock only advances on a *successful*
send, so while offline we keep retrying every cycle rather than backing off.

Manual setup only (MVP): the operator pastes the ping URL from Healthchecks.io. Auto-
provisioning via the Management API (which would mean storing a powerful API key) is
intentionally left out — see ``docs/monitoring.md``.
"""
import logging
import time

import requests

from mining_dashboard.config.config import (
HEALTHCHECKS_ENABLED,
HEALTHCHECKS_PING_URL,
HEALTHCHECKS_BASE_URL,
HEALTHCHECKS_INTERVAL_SEC,
HEALTHCHECKS_FAIL_ON_NODE_DOWN,
)

logger = logging.getLogger("Healthchecks")

# A ping is a tiny request; keep the timeout short so a hung endpoint can't stall the loop's
# worker thread for long. Healthchecks.io recommends GET/HEAD/POST to the ping URL.
_PING_TIMEOUT_SEC = 10


def _resolve_ping_url(ping_url, base_url):
"""Resolve the configured ping URL into a full success endpoint, or ``""`` if unset.

Two accepted shapes, so the same config works for hosted and self-hosted instances:

- A full ``http(s)://...`` URL (what Healthchecks.io shows you) is used as-is. This already
carries the host, so self-hosted is supported by pasting the self-hosted URL — no
``base_url`` needed.
- A bare uuid/slug is joined onto ``base_url`` (default ``https://hc-ping.com``; override it
to point at a self-hosted instance, e.g. ``https://hc.example.com/ping``).

Trailing slashes are normalised so the ``/fail`` and ``/start`` suffixes append cleanly.
"""
ping_url = (ping_url or "").strip()
if not ping_url:
return ""
if ping_url.startswith(("http://", "https://")):
return ping_url.rstrip("/")
return (base_url or "").rstrip("/") + "/" + ping_url.lstrip("/")


class HealthchecksClient:
    """Pings a Healthchecks.io check on a throttle; safe to call every loop cycle.

    A disabled client never touches the network. An enabled client without a resolvable
    URL warns once and then stays quiet. Network and HTTP failures are logged at DEBUG
    only and never raised.
    """

    def __init__(self, enabled, ping_url, base_url, interval_seconds,
                 fail_on_node_down, clock=time.monotonic):
        """Configure the client; no network traffic happens here.

        Args:
            enabled: Master switch; coerced to ``bool``.
            ping_url: Full ping URL or bare uuid/slug (see ``_resolve_ping_url``).
            base_url: Endpoint root used when ``ping_url`` is a bare uuid/slug.
            interval_seconds: Minimum seconds between accepted pings; ``None`` and
                negative values become ``0`` (ping on every call).
            fail_on_node_down: When true, ``ping(fail=True)`` targets ``<url>/fail``.
            clock: Zero-argument callable returning the current time in seconds;
                defaults to ``time.monotonic``.
        """
        self.enabled = bool(enabled)
        self.url = _resolve_ping_url(ping_url, base_url)
        self.interval = max(0, int(interval_seconds or 0))
        self.fail_on_node_down = bool(fail_on_node_down)
        self._clock = clock
        self._last_ping = None  # clock reading of the last *accepted* ping
        self._warned_misconfig = False

        if self.enabled and self.url:
            logger.info(
                "Healthchecks.io dead-man's switch enabled (ping every %ss%s).",
                self.interval,
                ", /fail on required-node-down" if self.fail_on_node_down else "",
            )

    @classmethod
    def from_config(cls):
        """Build a client from the module-level config (env-backed) values."""
        return cls(
            enabled=HEALTHCHECKS_ENABLED,
            ping_url=HEALTHCHECKS_PING_URL,
            base_url=HEALTHCHECKS_BASE_URL,
            interval_seconds=HEALTHCHECKS_INTERVAL_SEC,
            fail_on_node_down=HEALTHCHECKS_FAIL_ON_NODE_DOWN,
        )

    @property
    def active(self):
        """True only when enabled *and* a usable ping URL is configured."""
        return self.enabled and bool(self.url)

    def _due(self, now):
        """Whether enough time has passed since the last accepted ping to send another."""
        if self._last_ping is None:
            return True
        return (now - self._last_ping) >= self.interval

    def ping(self, fail=False):
        """Send one heartbeat (or ``/fail``) if due. Never raises.

        ``fail`` signals a required node is down; it sends ``/fail`` only when
        ``fail_on_node_down`` is on, otherwise a plain success ping (liveness only).

        Returns ``True`` only if a request was sent *and* the server answered with a
        non-error HTTP status. Returns ``False`` when disabled, misconfigured, throttled,
        when the request could not be delivered, or when the server rejected it (4xx/5xx).
        The throttle clock advances only on ``True``, so failures are retried on the next
        call instead of waiting out the interval.
        """
        if not self.enabled:
            return False
        if not self.url:
            # Enabled but nothing to ping — surface the misconfig once, then stay quiet.
            if not self._warned_misconfig:
                logger.warning(
                    "Healthchecks enabled but no ping_url configured — not pinging. "
                    "Set healthchecks.ping_url in config.json."
                )
                self._warned_misconfig = True
            return False

        now = self._clock()
        if not self._due(now):
            return False

        send_fail = fail and self.fail_on_node_down
        endpoint = self.url + "/fail" if send_fail else self.url
        try:
            response = requests.get(endpoint, timeout=_PING_TIMEOUT_SEC)
            # A response is not automatically an accepted ping: a wrong uuid (404) or a
            # server error (5xx) must not count as success or advance the throttle.
            if not response.ok:
                logger.debug("Healthchecks ping rejected (HTTP %s).", response.status_code)
                return False
            self._last_ping = now
            return True
        except requests.RequestException as e:
            # Offline / Tor-only / endpoint hiccup: the whole point is to survive these
            # silently — Healthchecks.io will alert on the missed ping. DEBUG, never noise.
            # Log only the exception type: the exception text embeds the ping URL, which
            # is a secret.
            logger.debug("Healthchecks ping failed (offline?): %s", type(e).__name__)
            return False
        except Exception as e:  # pragma: no cover - defensive; never break the loop
            logger.debug("Healthchecks unexpected error: %s", e)
            return False
74 changes: 74 additions & 0 deletions build/dashboard/tests/service/test_data_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -565,6 +565,80 @@ async def test_run_nonblocking_tari_releases_and_stays_operational(self):
assert svc.latest_data["global_sync"] is False
assert svc.latest_data["tari_syncing_passive"] is True

async def _run_one_iteration(self, svc, monero_sync, tari_sync):
    """Drive a single loop iteration with the given per-node sync signals.

    Args:
        svc: The service under test; its ``run()`` coroutine is awaited.
        monero_sync: Value returned by the patched ``get_monero_sync_status``.
        tari_sync: Value returned by the mocked Tari client's ``get_sync_status``.

    ``asyncio.sleep`` is patched to raise ``StopAsyncIteration``, and the helper asserts
    that ``svc.run()`` propagates it — that is how the otherwise endless loop is stopped.
    """
    # Stand-ins for the clients the service constructs; only the awaited methods are async.
    worker_client = MagicMock()
    worker_client.get_stats = AsyncMock(return_value={})
    tari_client = MagicMock()
    tari_client.get_sync_status = AsyncMock(return_value=tari_sync)
    tari_client.close = AsyncMock()

    # Replace every collector/client referenced through ds_mod with canned values so the
    # iteration performs no real I/O, then break out of the loop at the first sleep.
    with patch.object(ds_mod, "ClientSession", _FakeClientSession), \
            patch.object(ds_mod, "XMRigWorkerClient", return_value=worker_client), \
            patch.object(ds_mod, "TariClient", return_value=tari_client), \
            patch.object(ds_mod, "get_stratum_stats", return_value=({}, [])), \
            patch.object(ds_mod, "get_network_stats", return_value={"height": 100}), \
            patch.object(ds_mod, "get_tari_stats", return_value={"active": True, "status": "OK", "height": 3}), \
            patch.object(ds_mod, "get_p2pool_stats", return_value={"pool": {"last_share_time": 0, "difficulty": 0}}), \
            patch.object(ds_mod, "get_monero_sync_status", AsyncMock(return_value=monero_sync)), \
            patch.object(ds_mod, "get_disk_usage", return_value={}), \
            patch.object(ds_mod, "get_hugepages_status", return_value=("Enabled", "ok", "1/2")), \
            patch.object(ds_mod, "get_memory_usage", return_value={}), \
            patch.object(ds_mod, "get_load_average", return_value="0"), \
            patch.object(ds_mod, "get_cpu_usage", return_value="0%"), \
            patch("asyncio.sleep", AsyncMock(side_effect=StopAsyncIteration)):
        with pytest.raises(StopAsyncIteration):
            await svc.run()

async def test_healthchecks_pinged_when_healthy(self):
    """Both nodes reachable and synced: exactly one plain success ping (``fail=False``)."""
    svc, _state_manager, proxy = _make_service()
    proxy.get_workers.return_value = {"workers": []}

    # Enabled stand-in for the Healthchecks client so the call can be inspected.
    hc_client = MagicMock()
    hc_client.enabled = True
    hc_client.ping.return_value = True
    svc.healthchecks = hc_client

    synced_monero = {
        "is_syncing": False,
        "reachable": True,
        "percent": 100,
        "current": 100,
        "target": 100,
    }
    synced_tari = {"is_syncing": False, "reachable": True}
    await self._run_one_iteration(svc, monero_sync=synced_monero, tari_sync=synced_tari)

    hc_client.ping.assert_called_once_with(fail=False)

async def test_healthchecks_fail_when_required_node_down(self):
    """A debounced-down required node (monerod) turns the ping into ``fail=True``."""
    svc, _state_manager, proxy = _make_service()
    proxy.get_workers.return_value = {"workers": []}

    hc_client = MagicMock()
    hc_client.enabled = True
    hc_client.ping.return_value = True
    svc.healthchecks = hc_client

    # Pin the debounced node-health verdicts for this cycle: monerod DOWN, Tari up.
    monero_monitor = MagicMock()
    monero_monitor.update.return_value = True
    svc.monero_health = monero_monitor
    tari_monitor = MagicMock()
    tari_monitor.update.return_value = False
    svc.tari_health = tari_monitor

    unreachable_monero = {"is_syncing": False, "reachable": False}
    reachable_tari = {"is_syncing": False, "reachable": True}
    await self._run_one_iteration(
        svc, monero_sync=unreachable_monero, tari_sync=reachable_tari
    )

    hc_client.ping.assert_called_once_with(fail=True)

async def test_healthchecks_not_pinged_when_disabled(self):
    """Default case: a disabled client is never invoked from the loop (no worker thread)."""
    svc, _state_manager, proxy = _make_service()
    proxy.get_workers.return_value = {"workers": []}

    disabled_client = MagicMock()
    disabled_client.enabled = False
    svc.healthchecks = disabled_client

    reachable_monero = {"is_syncing": False, "reachable": True}
    reachable_tari = {"is_syncing": False, "reachable": True}
    await self._run_one_iteration(
        svc, monero_sync=reachable_monero, tari_sync=reachable_tari
    )

    disabled_client.ping.assert_not_called()

async def test_iteration_survives_collector_error(self):
svc, sm, proxy = _make_service()
worker_client = MagicMock()
Expand Down
Loading
Loading