diff --git a/CHANGELOG.md b/CHANGELOG.md
index 35bf67a..5ccbe9f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -66,6 +66,14 @@ per the process in [`docs/releasing.md`](docs/releasing.md).
   `docker-proxy`, and `docker-control` previously fell back to Docker's uncapped default, so their
   logs could grow without bound and fill the disk on a long-running host (#123).
 
+### Fixed
+
+- `pithead backup` no longer aborts when `du`/`df` exit non-zero on an unreadable file or a
+  transient FS error — the disk-space pre-check now degrades gracefully (its "proceeding without
+  a space check" fallback was previously unreachable under `set -e`) (#127).
+- `pithead doctor` now exits non-zero when a critical check FAILS, so it can be used as a
+  cron/CI/monitoring health gate (it previously always exited 0); warnings alone still exit 0 (#127).
+
 ### Security
 
 - The monerod RPC credentials are no longer interpolated into the compose healthcheck command
diff --git a/pithead b/pithead
index 1ddc7f2..79cdb4b 100755
--- a/pithead
+++ b/pithead
@@ -371,6 +371,9 @@ doctor() {
   else
     log "All checks passed."
   fi
+  # Propagate a non-zero exit when any critical check FAILED, so `doctor` is usable as a health
+  # gate in cron/CI/monitoring (mirrors `status`). Warnings alone still exit 0 (#127).
+  [ "$DR_FAIL" -gt 0 ] && return 1
   return 0
 }
 
@@ -483,8 +486,12 @@ stack_backup() {
   # so we assume the archive needs roughly the source size plus a ~5% safety margin.
   local need_kb avail_kb
   # sudo: the Tor data dir is owned by 100:101; -c gives a grand total across all items.
-  need_kb=$(sudo du -sck "${items[@]}" 2>/dev/null | awk 'END{print $1}')
-  avail_kb=$(df -Pk "$backups_dir" 2>/dev/null | awk 'NR==2{print $4}')
+  # `|| true`: du exits non-zero if it can't read any descendant (a permission-denied subdir, a
+  # file vanishing mid-walk, an NFS hiccup) even though 2>/dev/null hides the message and a total
+  # is still printed. Without this, errexit (set -e) aborts the whole backup on a bare assignment,
+  # making the graceful "proceeding without a space check" fallback below unreachable (#127).
+  need_kb=$(sudo du -sck "${items[@]}" 2>/dev/null | awk 'END{print $1}') || true
+  avail_kb=$(df -Pk "$backups_dir" 2>/dev/null | awk 'NR==2{print $4}') || true
   if [ -n "$need_kb" ] && [ -n "$avail_kb" ]; then
     if [ "$avail_kb" -lt "$((need_kb + need_kb / 20))" ]; then
       local avail_gib=$((avail_kb / 1048576)) need_gib=$((need_kb / 1048576))
@@ -1743,7 +1750,7 @@ main() {
     upgrade) require_deployed; stack_upgrade ;;
     logs) require_env; log "Following logs (Ctrl+C to exit)..."; docker compose logs -f "$@" ;;
     status) require_env; stack_status || exit 1 ;;
-    doctor) doctor ;;
+    doctor) doctor || exit 1 ;;
     reset-dashboard) require_deployed; reset_dashboard "$@" ;;
     backup) stack_backup "$@" ;;
     restore) stack_restore "$@" ;;
diff --git a/tests/stack/run.sh b/tests/stack/run.sh
index 77045f6..add7a9a 100755
--- a/tests/stack/run.sh
+++ b/tests/stack/run.sh
@@ -350,6 +350,25 @@ REMOTE="tor=running:healthy monerod=missing p2pool=running:none tari=running:hea
 out="$(cd "$ST" && FAKE_STATES="$REMOTE" PATH="$ST/bin:$PATH" ./pithead status 2>&1)"; rc=$?
 assert_rc "status: remote mode ignores monerod" "$rc" "0"
 
+echo "== black-box: doctor exit code (#127) =="
+# doctor must EXIT NON-ZERO when a critical check fails, so it's usable as a cron/CI health gate
+# (it previously always returned 0). Drive one failure via an unreachable Docker daemon; jq/openssl
+# stay real on PATH so only the daemon check fails.
+DOC="$SANDBOX/doctor"; mkdir -p "$DOC/bin"; cp "$STACK" "$DOC/pithead"
+cat > "$DOC/bin/docker" <<'EOF'
+#!/usr/bin/env bash
+case "$*" in
+  "info") exit 1 ;; # daemon unreachable -> doctor records a critical FAIL
+  *) exit 0 ;; # `--version`, `compose version`, etc. succeed
+esac
+EOF
+printf '#!/usr/bin/env bash\nexit 0\n' > "$DOC/bin/sudo"
+chmod +x "$DOC/bin/docker" "$DOC/bin/sudo"
+out="$(cd "$DOC" && PATH="$DOC/bin:$PATH" ./pithead doctor 2>&1)"; rc=$?
+assert_contains "doctor runs to the summary" "$out" "Diagnostics summary"
+assert_contains "doctor flags the unreachable daemon" "$out" "Docker daemon is not reachable"
+assert_rc "doctor exits 1 on a critical FAIL" "$rc" "1"
+
 # ---------------------------------------------------------------------------
 echo ""
 printf 'pithead tests: \033[1;32m%d passed\033[0m, ' "$PASS"