From 0406af522c6fa189a9b2ebacbc4dd4dffc070b3e Mon Sep 17 00:00:00 2001 From: archipelago Date: Wed, 24 Jun 2026 05:27:10 -0400 Subject: [PATCH] test(lifecycle): add manifest-driven all-apps health matrix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The per-app suites cover ~8 core apps in depth; nothing covered the ~30 others (jellyfin, vaultwarden, penpot, nextcloud, grafana, …). all-apps-matrix.bats derives the app set from server.get-state package-data (no hardcoded list) and asserts baseline health across EVERY installed app: - settles to a non-transitional state within a window (the #13/#14 stuck-ghost class, generalized fleet-wide — installing/removing that never settles) - not in error/failed - reports a recognized (non-garbage) state - every running UI app (manifest ui=="true") exposes a non-null lan-address (the immich/port-drift unreachable-UI failure, generalized to all UI apps) Read-only, so it joins run.sh/run-gate.sh on every node and grows coverage as nodes install more apps. Verified 5/5 on .228 (17 apps) and .116 (20 apps). Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/lifecycle/TESTING.md | 15 +++ tests/lifecycle/bats/all-apps-matrix.bats | 134 ++++++++++++++++++++++ 2 files changed, 149 insertions(+) create mode 100644 tests/lifecycle/bats/all-apps-matrix.bats diff --git a/tests/lifecycle/TESTING.md b/tests/lifecycle/TESTING.md index 81d39ac0..488f4bd6 100644 --- a/tests/lifecycle/TESTING.md +++ b/tests/lifecycle/TESTING.md @@ -222,6 +222,21 @@ so it's safe on a populated node. Override the app via `ARCHY_CASCADE_APP` / `ARCHY_CASCADE_IMAGE` / `ARCHY_CASCADE_CONFIG` / `ARCHY_CASCADE_DATA_DIR`. Gated on `ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1`. Verified 7/7 on .228 (2026-06-24). 
+### All-apps lifecycle matrix (Workstream F) + +The per-app suites cover ~8 core apps in depth; `all-apps-matrix.bats` covers +**every installed app in breadth, automatically** — it derives the app set from +`server.get-state` package-data (no hardcoded list) and grows coverage as nodes +install more apps. **Read-only**, so it joins `run.sh`/`run-gate.sh` on every node. + +| Suite | Guards (fleet-wide) | Asserts (per installed app) | +|---|---|---| +| `all-apps-matrix.bats` | apps STUCK transitional (the #13/#14 ghost generalized), error/failed apps, unreachable UI apps (port-drift generalized) | settles to a non-transitional state within a window; not error/failed; recognized (non-garbage) state; every **running UI app** (manifest `ui=="true"`) exposes a non-null lan-address | + +Tunables: `ARCHY_MATRIX_SETTLE_SECS` (45), `ARCHY_MATRIX_UI_SECS` (30), +`ARCHY_MATRIX_ALLOW_STOPPED` (ids allowed non-running; currently read into `ALLOW_STOPPED` but not yet consulted by any assertion in the suite). Verified 5/5 on .228 +(17 apps) and .116 (20 apps incl. grafana/nextcloud/photoprism/gitea), 2026-06-24. + To exercise the Phase 3.2 Quadlet-backend path on a target node without editing config.json (which would require an archipelago restart and trigger FM3 until 3.5 ships), set the env var on `archipelago.service`: diff --git a/tests/lifecycle/bats/all-apps-matrix.bats b/tests/lifecycle/bats/all-apps-matrix.bats new file mode 100644 index 00000000..1644f493 --- /dev/null +++ b/tests/lifecycle/bats/all-apps-matrix.bats @@ -0,0 +1,134 @@ +#!/usr/bin/env bats +# tests/lifecycle/bats/all-apps-matrix.bats +# +# Manifest-driven, fleet-wide lifecycle health matrix. The per-app suites +# (bitcoin-knots, lnd, mempool, immich, …) cover ~8 core apps in depth; this +# covers EVERY installed app in breadth, automatically — no hardcoded list. +# +# It derives the app set from server.get-state's package-data (the My Apps map) +# and asserts baseline health across all of them.
Read-only (no destructive env
+# needed), so it joins run.sh / run-gate.sh on every node and grows coverage as
+# nodes install more apps.
+#
+# Catches, fleet-wide, the bug classes the narrow gate missed:
+#   - apps STUCK in a transitional state (the #13/#14 ghost: installing/removing
+#     that never settles)
+#   - apps sitting in error/failed
+#   - running UI apps with no reachable lan-address (generalized port-drift)
+
+load '../lib/rpc.bash'  # presumably the source of the rpc_* helpers called below — confirm
+
+# Transitional states are legitimate momentarily but must not PERSIST. Steady:
+# running/stopped/exited/created/paused/installed/not-installed.
+TRANSITIONAL_RE='^(installing|pulling-image|pulling|downloading|removing|uninstalling|updating|starting|stopping|restarting)$'
+BAD_RE='^(error|failed)$'  # anchored: matches only the exact states "error" and "failed"
+
+# Ids allowed to be non-running at rest (no UI/health expectation beyond
+# "settled"): ARCHY_MATRIX_ALLOW_STOPPED, space-separated, empty by default.
+# NOTE(review): no assertion in this file reads ALLOW_STOPPED yet — confirm intent.
+ALLOW_STOPPED="${ARCHY_MATRIX_ALLOW_STOPPED:-}"
+
+setup_file() {  # bats file-level setup hook: calls rpc_login once, requires ARCHY_PASSWORD
+  : "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}"  # abort with this message if unset or empty
+  export ARCHY_FORCE_LOGIN=1  # exported only around the rpc_login call; unset again right after
+  rpc_login
+  unset ARCHY_FORCE_LOGIN
+}
+
+teardown_file() {  # bats file-level teardown hook: calls rpc_logout_local
+  rpc_logout_local
+}
+
+# Print .data["package-data"] of the server.get-state result (the My Apps map) as compact JSON; {} when absent.
+get_package_data() {
+  rpc_result server.get-state '{}' 2>/dev/null | jq -c '.data["package-data"] // {}'  # rpc_result's stderr is discarded
+}
+
+# Installed app ids, one per line (as printed by jq -r 'keys[]').
+app_ids() { + get_package_data | jq -r 'keys[]' +} + +# ──────────────────────────────────────────────────────────────────── +@test "matrix has apps to check (get-state returns a non-empty My Apps map)" { + run app_ids + [ "$status" -eq 0 ] + [ -n "$output" ] + echo "# matrix covers $(echo "$output" | wc -w) apps: $(echo $output)" >&3 +} + +@test "no installed app is STUCK in a transitional state (settles within window)" { + local settle="${ARCHY_MATRIX_SETTLE_SECS:-45}" + local deadline=$(( $(date +%s) + settle )) + local stuck="" + # Re-poll: a transitional state right now may just be a genuine in-progress op, + # so only fail apps that are STILL transitional after the settle window. + while :; do + stuck="" + local pd; pd=$(get_package_data) + for id in $(echo "$pd" | jq -r 'keys[]'); do + local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"') + [[ "$st" =~ $TRANSITIONAL_RE ]] && stuck+="${id}=${st} " + done + [[ -z "$stuck" ]] && break + (( $(date +%s) >= deadline )) && break + sleep 5 + done + [[ -z "$stuck" ]] || { echo "# STUCK transitional after ${settle}s: $stuck" >&3; false; } +} + +@test "no installed app is in an error/failed state" { + local pd; pd=$(get_package_data) + local bad="" + for id in $(echo "$pd" | jq -r 'keys[]'); do + local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"') + [[ "$st" =~ $BAD_RE ]] && bad+="${id}=${st} " + done + [[ -z "$bad" ]] || { echo "# error/failed apps: $bad" >&3; false; } +} + +@test "every running app reports a recognized state (no empty/garbage state)" { + local pd; pd=$(get_package_data) + local junk="" + for id in $(echo "$pd" | jq -r 'keys[]'); do + local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"') + case "$st" in + running|stopped|exited|created|paused|installed|not-installed|\ +installing|pulling-image|pulling|downloading|removing|uninstalling|updating|starting|stopping|restarting|\ +error|failed|degraded) : ;; + *) junk+="${id}='${st}' " ;; 
+ esac + done + [[ -z "$junk" ]] || { echo "# unrecognized state values: $junk" >&3; false; } +} + +@test "every running UI app exposes a lan-address (generalized port-drift)" { + # A running app whose manifest declares a UI interface (ui=="true") must have a + # non-null lan-address on that interface — otherwise its UI is unreachable + # (the immich/port-drift failure mode, asserted across ALL UI apps). Poll + # briefly to absorb the transient null seen while a container is mid-recreate. + local deadline=$(( $(date +%s) + ${ARCHY_MATRIX_UI_SECS:-30} )) + local missing="" + while :; do + missing="" + local pd; pd=$(get_package_data) + for id in $(echo "$pd" | jq -r 'keys[]'); do + local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"') + [[ "$st" == "running" ]] || continue + # interface keys whose manifest marks ui=="true" + local ui_ifaces + ui_ifaces=$(echo "$pd" | jq -r --arg i "$id" \ + '.[$i].manifest.interfaces // {} | to_entries[] | select(.value.ui=="true") | .key') + for k in $ui_ifaces; do + local addr + addr=$(echo "$pd" | jq -r --arg i "$id" --arg k "$k" \ + '.[$i].installed["interface-addresses"][$k]["lan-address"] // "null"') + [[ "$addr" == "null" || -z "$addr" ]] && missing+="${id}:${k} " + done + done + [[ -z "$missing" ]] && break + (( $(date +%s) >= deadline )) && break + sleep 3 + done + [[ -z "$missing" ]] || { echo "# running UI apps missing lan-address: $missing" >&3; false; } +}