archy/tests/lifecycle/os-audit.sh
archipelago 329e7811eb test(lifecycle): add os-audit OS-wide health gate; docs: v1.7.91 resume notes
os-audit.sh: one non-destructive scorecard tying backend/RPC health, the
all-apps lifecycle audit (delegates to remote-lifecycle.sh), and the FM-guards
(port-drift, secret-completeness, orphan-container sweep, OTA-wedge). The
per-boot building block for the reboot-survival loop. FM12 check uses jq has()
not // (// treats a legit false as empty). Section A validated all-PASS on .116.

docs: v1.7.91 release-pass resume notes + the bitcoinReceive blocker writeup.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-14 04:36:06 -04:00

240 lines
10 KiB
Bash
Executable File

#!/usr/bin/env bash
# tests/lifecycle/os-audit.sh — one non-destructive OS-wide health gate.
#
# Ties together, in a single pass with one scorecard + exit code:
# A. Backend / RPC health — node is up, not wedged mid-OTA, core daemons answer
# B. All-apps lifecycle audit — every catalog app: valid state, real health,
# reachable launch URL, populated launch metadata
# (delegates to remote-lifecycle.sh, audit-only)
# C. FM-guards — the concrete failure modes that have bitten the
# fleet: port-drift (FM8), secret-completeness (FM2),
# orphaned container states (FM9), OTA wedge (FM12)
#
# Everything here is READ-ONLY: no install/stop/start/uninstall, no service bounce.
# Safe to run against a live production node. It is the per-boot building block the
# reboot-survival harness (L3) calls after each reboot.
#
# Env:
# ARCHY_HOST (default 127.0.0.1)
# ARCHY_SCHEME (default https; use http for .116 / nginx-:80-only nodes)
# ARCHY_PASSWORD (required)
# ARCHY_LOCAL (auto: 1 when ARCHY_HOST is loopback) — gates host-only podman checks
#
# Usage:
# ARCHY_HOST=127.0.0.1 ARCHY_SCHEME=http ARCHY_PASSWORD=... tests/lifecycle/os-audit.sh
#
# Exit: 0 = every section green; 1 = one or more checks failed; 2 = setup/usage error.
# Strictness: unset variables are errors and a pipeline reports the failure of
# any stage. `-e` is NOT enabled, so a failing command does not abort the script.
set -uo pipefail

# Absolute directory of this script; remote-lifecycle.sh and run.sh are invoked
# relative to it.
HERE="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"

ARCHY_HOST="${ARCHY_HOST:-127.0.0.1}"
ARCHY_SCHEME="${ARCHY_SCHEME:-https}"
ARCHY_PASSWORD="${ARCHY_PASSWORD:-}"

# A bare IPv6 literal (two or more colons, e.g. ::1) has to be wrapped in
# brackets inside a URL, otherwise the text after the first colon is parsed as a
# port. "host:port" (one colon) and already-bracketed values are left untouched.
# ARCHY_HOST itself is not modified — only the URL built from it.
if [[ "$ARCHY_HOST" == *:*:* && "$ARCHY_HOST" != \[* ]]; then
  BASE_URL="${ARCHY_SCHEME}://[${ARCHY_HOST}]"
else
  BASE_URL="${ARCHY_SCHEME}://${ARCHY_HOST}"
fi

# Host-only checks (podman sweeps) make sense only when this script runs ON the node.
# An explicit ARCHY_LOCAL from the environment always wins over auto-detection.
if [[ -z "${ARCHY_LOCAL:-}" ]]; then
  case "$ARCHY_HOST" in
    127.0.0.1 | localhost | ::1 | '[::1]') ARCHY_LOCAL=1 ;;
    *) ARCHY_LOCAL=0 ;;
  esac
fi

# Setup/usage errors exit 2 so callers can tell them apart from audit failures (1).
if [[ -z "$ARCHY_PASSWORD" ]]; then
  echo "ARCHY_PASSWORD env var must be set." >&2
  exit 2
fi

for tool in curl jq; do
  command -v "$tool" >/dev/null 2>&1 || { echo "missing required tool: $tool" >&2; exit 2; }
done
# ── scorecard state ───────────────────────────────────────────────────────────
# Running tallies per verdict plus the formatted rows printed in the final
# scorecard.
PASS=0
FAIL=0
WARN=0
declare -a RESULTS=()

# record <PASS|FAIL|WARN> <label> [detail]
#   Bumps the counter matching the verdict, appends an aligned row to RESULTS,
#   and echoes the result immediately. A verdict other than PASS/FAIL/WARN
#   changes no counter but is still appended and printed.
record() {
  local verdict="$1"
  local check="$2"
  local note="${3:-}"
  local row

  if [[ "$verdict" == "PASS" ]]; then
    PASS=$((PASS + 1))
  elif [[ "$verdict" == "FAIL" ]]; then
    FAIL=$((FAIL + 1))
  elif [[ "$verdict" == "WARN" ]]; then
    WARN=$((WARN + 1))
  fi

  row="$(printf '%-4s %-38s %s' "$verdict" "$check" "$note")"
  RESULTS+=("$row")
  printf ' [%s] %s %s\n' "$verdict" "$check" "$note"
}
# ── minimal RPC client (session + CSRF) ────────────────────────────────────────
# Session cookie and CSRF token captured by rpc_login; both are empty until a
# login succeeds.
SESSION=""; CSRF=""

# rpc_login — POSTs auth.login to ${BASE_URL}/rpc/v1 and extracts the `session`
# and `csrf_token` cookies from the response headers.
# Globals:  BASE_URL, ARCHY_PASSWORD (read); SESSION, CSRF (written)
# Returns:  0 when both cookies were captured, non-zero otherwise.
#
# The password is kept off every command line: jq reads it from its environment
# (env.ARCHY_PASSWORD) and the JSON body reaches curl on stdin (-d @-), so it
# does not show up in `ps` output.
rpc_login() {
  local hdr
  hdr=$(mktemp) || return 1

  # The assignment prefix guarantees the variable is in jq's environment even
  # if it was never exported. `-d @-` strips the trailing newline jq emits, so
  # the request body is the bare JSON document.
  ARCHY_PASSWORD="$ARCHY_PASSWORD" jq -nc \
    '{jsonrpc:"2.0",id:1,method:"auth.login",params:{password:env.ARCHY_PASSWORD}}' \
    | curl -sk -D "$hdr" -X POST "${BASE_URL}/rpc/v1" \
        -H 'Content-Type: application/json' \
        -d @- -o /dev/null 2>/dev/null

  # Header names are matched case-insensitively; the CR of the header line
  # ending is removed from the extracted value.
  SESSION=$(grep -i '^set-cookie: session=' "$hdr" | head -1 | sed -E 's/.*session=([^;]+).*/\1/' | tr -d '\r')
  CSRF=$(grep -i '^set-cookie: csrf_token=' "$hdr" | head -1 | sed -E 's/.*csrf_token=([^;]+).*/\1/' | tr -d '\r')
  rm -f -- "$hdr"

  [[ -n "$SESSION" && -n "$CSRF" ]]
}
# rpc <method> [params-json] -> prints raw JSON response
#   Builds a JSON-RPC 2.0 envelope (id 2) with jq and POSTs it to
#   ${BASE_URL}/rpc/v1, sending the session and CSRF cookies plus the
#   X-CSRF-Token header. A missing or empty second argument means params = {}.
#   curl's stderr is discarded; the exit status is curl's.
rpc() {
  local rpc_method="$1"
  local rpc_params='{}'
  local envelope

  if [[ -n "${2:-}" ]]; then
    rpc_params="$2"
  fi

  envelope="$(jq -nc --arg m "$rpc_method" --argjson p "$rpc_params" \
    '{jsonrpc:"2.0",id:2,method:$m,params:$p}')"

  curl -sk -X POST "${BASE_URL}/rpc/v1" \
    -H 'Content-Type: application/json' \
    -H "Cookie: session=${SESSION}; csrf_token=${CSRF}" \
    -H "X-CSRF-Token: ${CSRF}" \
    -d "$envelope" 2>/dev/null
}
# rpc_ok <method> [params] -> 0 if a result came back with no error
#   Succeeds only when the reply is non-empty, its `.error` is absent (or
#   null/false/empty string), and it has a top-level "result" key. jq parse
#   failures are silenced and end up as a non-zero return.
rpc_ok() {
  local reply err_field has_result

  reply="$(rpc "$@")"
  [[ -n "$reply" ]] || return 1

  err_field="$(jq -r '.error // empty' <<<"$reply" 2>/dev/null)"
  [[ -z "$err_field" ]] || return 1

  has_result="$(jq -r 'has("result")' <<<"$reply" 2>/dev/null)"
  [[ "$has_result" == "true" ]]
}
# ══ Section A — Backend / RPC health ═══════════════════════════════════════════
# section_a — probes the node itself and records one scorecard row per check:
#   1. unauthenticated `health` RPC   (FAIL + early return if it does not answer)
#   2. auth.login                     (FAIL + early return if no session)
#   3. update.status                  (FM12: OTA wedged mid-apply)
#   4. bitcoin / lnd / system metrics (WARN only — may not be installed)
#   5. system.disk-status             (FM13: >= 90% used is a FAIL)
# Globals: BASE_URL (read); SESSION/CSRF (written via rpc_login); scorecard
#          counters (written via record).
section_a() {
  echo
  echo "== A. Backend / RPC health =="

  # Unauthenticated health probe first (doesn't need a session).
  local health health_status
  health=$(curl -sk -X POST "${BASE_URL}/rpc/v1" -H 'Content-Type: application/json' \
    -d '{"jsonrpc":"2.0","id":1,"method":"health","params":{}}' 2>/dev/null)
  health_status=$(jq -r '.result.status // empty' <<<"$health" 2>/dev/null)
  if [[ "$health_status" =~ ^(ok|degraded)$ ]]; then
    record PASS "node responds (health)" "status=${health_status}"
  else
    record FAIL "node responds (health)" "no/invalid health response — node down?"
    return
  fi

  if ! rpc_login; then
    record FAIL "auth.login" "could not establish session (wrong password or rate-limited)"
    return
  fi
  record PASS "auth.login" "session established"

  # FM12 — OTA must not be wedged mid-apply.
  # NB: must use has() not `//` — jq's `//` treats a legit `false` as empty and
  # would fall through to "unknown" on a perfectly healthy node.
  local us inprog
  us=$(rpc update.status)
  inprog=$(jq -r '
    if (.result|type=="object") and (.result|has("update_in_progress")) then .result.update_in_progress
    elif (.result|type=="object") and (.result|has("in_progress")) then .result.in_progress
    else "unknown" end' <<<"$us" 2>/dev/null)
  case "$inprog" in
    false)
      record PASS "OTA not wedged (update.status)" "update_in_progress=false"
      ;;
    "" | null | unknown)
      # An empty or unparseable reply makes jq print nothing, and a JSON null
      # prints "null": neither says an update is running, so they are reported
      # as unreadable rather than as a wedge.
      record WARN "OTA not wedged (update.status)" "could not read update_in_progress"
      ;;
    *)
      record FAIL "OTA not wedged (update.status)" "update_in_progress=$inprog (FM12 wedge)"
      ;;
  esac

  # Core daemons answer (only assert for ones present on this node)
  if rpc_ok bitcoin.getinfo || rpc_ok bitcoin.relay-status; then
    record PASS "bitcoin RPC reachable" ""
  else
    record WARN "bitcoin RPC reachable" "bitcoin.getinfo/relay-status did not answer (not installed?)"
  fi
  if rpc_ok lnd.getinfo; then
    record PASS "lnd RPC reachable" ""
  else
    record WARN "lnd RPC reachable" "lnd.getinfo did not answer (not installed / wallet locked?)"
  fi
  if rpc_ok system.stats || rpc_ok system.get-metrics; then
    record PASS "system metrics reachable" ""
  else
    record WARN "system metrics reachable" "system.stats/get-metrics did not answer"
  fi

  # FM13 — disk pressure early-warning (best-effort; field names vary by version)
  local ds usep pct_re='^([0-9]+)(\.[0-9]+)?$'
  ds=$(rpc system.disk-status)
  usep=$(jq -r '[.result.use_percent,.result.used_percent,.result.percent]|map(select(.!=null))|first // empty' <<<"$ds" 2>/dev/null)
  usep="${usep%\%}" # tolerate a value reported as e.g. "45%"
  if [[ -z "$usep" ]]; then
    : # no usable field in this version's reply — nothing to report
  elif [[ "$usep" =~ $pct_re ]]; then
    # Compare the integer part only; 10# forces base 10 so "08" is not read as
    # (invalid) octal.
    if ((10#${BASH_REMATCH[1]} >= 90)); then
      record FAIL "disk pressure (system.disk-status)" "${usep}% used (FM13 risk)"
    else
      record PASS "disk pressure (system.disk-status)" "${usep}% used"
    fi
  else
    # Never hand an arbitrary string to (( )): under `set -u` an unset-looking
    # token such as "n/a" would abort the whole audit.
    record WARN "disk pressure (system.disk-status)" "unparseable usage value '${usep}'"
  fi
}
# ══ Section B — All-apps lifecycle audit (delegates to remote-lifecycle.sh) ═════
# section_b — runs $HERE/remote-lifecycle.sh with an empty ARCHY_APPS and
# ARCHY_FULL_LIFECYCLE=0, echoes its combined stdout/stderr (minus jq error
# lines), and records a single PASS/FAIL row based on its exit status.
section_b() {
  local audit_log audit_rc failed_count

  echo
  echo "== B. All-apps lifecycle audit (non-destructive, all catalog apps) =="

  # No ARCHY_APPS + no ARCHY_FULL_LIFECYCLE => audit every catalog app (audit_app).
  audit_log="$(
    ARCHY_HOST="$ARCHY_HOST" \
    ARCHY_SCHEME="$ARCHY_SCHEME" \
    ARCHY_PASSWORD="$ARCHY_PASSWORD" \
    ARCHY_APPS="" \
    ARCHY_FULL_LIFECYCLE=0 \
      "$HERE/remote-lifecycle.sh" 2>&1
  )"
  audit_rc=$?

  # Surface the per-app lines but drop the noisy optional-probe jq parse errors.
  echo "$audit_log" \
    | grep -vE '^jq: (parse )?error' \
    | sed 's/^/ /'

  if ((audit_rc != 0)); then
    # The last "FAILED checks: N" line, if any, supplies the count; otherwise "?".
    failed_count="$(echo "$audit_log" | grep -oE 'FAILED checks: [0-9]+' | grep -oE '[0-9]+' | tail -1)"
    record FAIL "broad all-apps audit" "remote-lifecycle.sh exit $audit_rc (${failed_count:-?} app checks failed)"
    return
  fi

  record PASS "broad all-apps audit" "remote-lifecycle.sh exit 0"
}
# ══ Section C — FM-guards ══════════════════════════════════════════════════════
# run_bats_guard <suite> <label> <fm>
#   Runs "$HERE/run.sh <suite>" with the ARCHY_* connection settings in its
#   environment and records one scorecard row under <label>:
#     WARN — bats is not installed, guard skipped
#     PASS — runner exited 0
#     FAIL — runner exited non-zero; the detail is "<fm>: <first 'not ok' line>",
#            or the exit code when the runner produced no TAP failure line
#            (e.g. it could not start at all).
run_bats_guard() {
  local suite="$1" label="$2" fm="$3" out rc first_failure

  if ! command -v bats >/dev/null 2>&1; then
    record WARN "$label" "bats not installed — $fm guard skipped"
    return
  fi

  out=$(ARCHY_HOST="$ARCHY_HOST" ARCHY_SCHEME="$ARCHY_SCHEME" ARCHY_PASSWORD="$ARCHY_PASSWORD" \
    "$HERE/run.sh" "$suite" 2>&1)
  rc=$?

  if ((rc == 0)); then
    record PASS "$label" "$fm guard green"
    return
  fi

  # Separate the FM tag from the failure text, and never leave the detail as a
  # bare tag when there is no "not ok" line to quote.
  first_failure=$(grep -E '^not ok' <<<"$out" | head -n 1)
  record FAIL "$label" "${fm}: ${first_failure:-run.sh exit ${rc} (no 'not ok' line in output)}"
}
# section_c — FM-guards: two bats-backed guards (FM8 port-drift, FM2
# secret-completeness) followed by the FM9 sweep for containers whose podman
# status contains stopping/removing/created.
# The sweep needs podman on the audited node, so it is skipped with a WARN when
# ARCHY_LOCAL is not "1" or when podman is not installed; the WARN detail says
# which of the two applied.
section_c() {
  echo
  echo "== C. FM-guards (the concrete fleet failure modes) =="
  run_bats_guard port-drift "port bindings match manifest" "FM8"
  run_bats_guard secret-completeness "all referenced secrets exist" "FM2"

  # FM9 — orphaned container states (host-only: needs local podman)
  if [[ "$ARCHY_LOCAL" != "1" ]]; then
    record WARN "no orphaned container states" "remote node — host podman sweep skipped"
    return
  fi
  if ! command -v podman >/dev/null 2>&1; then
    record WARN "no orphaned container states" "podman not installed — host podman sweep skipped"
    return
  fi

  local orphans
  # `|| true`: grep exits 1 when nothing matches, which is the healthy case.
  orphans=$(podman ps -a --format '{{.Names}} {{.Status}}' 2>/dev/null \
    | grep -iE '(^| )(stopping|removing|created)( |$)' || true)
  if [[ -z "$orphans" ]]; then
    record PASS "no orphaned container states" "no stopping/removing/created"
  else
    record FAIL "no orphaned container states" "FM9: $(echo "$orphans" | tr '\n' ';')"
  fi
}
# ── run ────────────────────────────────────────────────────────────────────────
# Banner, then Section A; Sections B and C follow unless A failed before a
# session was established. Finishes with the scorecard and the exit code:
# 1 when any check recorded FAIL, 0 otherwise.
printf '%s\n' \
  "==============================================================" \
  " OS-wide audit — ${BASE_URL} ($(date '+%Y-%m-%d %H:%M:%S'))" \
  " local=${ARCHY_LOCAL}" \
  "=============================================================="

section_a

# Only proceed to apps/FM-guards if the node itself answered.
if ((FAIL != 0)) && [[ -z "$SESSION" ]]; then
  : # Section A failed and no session exists — skip B and C
else
  section_b
  section_c
fi

printf '%s\n' \
  "" \
  "==============================================================" \
  " SCORECARD: ${PASS} pass / ${FAIL} fail / ${WARN} warn" \
  "=============================================================="
printf '%s\n' "${RESULTS[@]}"
echo

if ((FAIL == 0)); then
  echo "RESULT: PASS"
  exit 0
fi

echo "RESULT: FAIL ($FAIL critical checks failed)"
exit 1