test(lifecycle): add manifest-driven all-apps health matrix

The per-app suites cover ~8 core apps in depth; nothing covered the ~30 others
(jellyfin, vaultwarden, penpot, nextcloud, grafana, …). all-apps-matrix.bats
derives the app set from server.get-state package-data (no hardcoded list) and
asserts baseline health across EVERY installed app:
  - settles to a non-transitional state within a window (the #13/#14 stuck-ghost
    class, generalized fleet-wide — installing/removing that never settles)
  - not in error/failed
  - reports a recognized (non-garbage) state
  - every running UI app (manifest ui=="true") exposes a non-null lan-address
    (the immich/port-drift unreachable-UI failure, generalized to all UI apps)

Read-only, so it joins run.sh/run-gate.sh on every node and grows coverage as
nodes install more apps. Verified 5/5 on .228 (17 apps) and .116 (20 apps).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-24 05:27:10 -04:00
parent 57a69257c4
commit 0406af522c
2 changed files with 149 additions and 0 deletions

View File

@ -222,6 +222,21 @@ so it's safe on a populated node. Override the app via `ARCHY_CASCADE_APP` /
`ARCHY_CASCADE_IMAGE` / `ARCHY_CASCADE_CONFIG` / `ARCHY_CASCADE_DATA_DIR`.
Gated on `ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1`. Verified 7/7 on .228 (2026-06-24).
### All-apps lifecycle matrix (Workstream F)
The per-app suites cover ~8 core apps in depth; `all-apps-matrix.bats` covers
**every installed app in breadth, automatically** — it derives the app set from
`server.get-state` package-data (no hardcoded list) and grows coverage as nodes
install more apps. **Read-only**, so it joins `run.sh`/`run-gate.sh` on every node.
| Suite | Guards (fleet-wide) | Asserts (per installed app) |
|---|---|---|
| `all-apps-matrix.bats` | apps STUCK transitional (the #13/#14 ghost generalized), error/failed apps, unreachable UI apps (port-drift generalized) | settles to a non-transitional state within a window; not error/failed; recognized (non-garbage) state; every **running UI app** (manifest `ui=="true"`) exposes a non-null lan-address |
Tunables: `ARCHY_MATRIX_SETTLE_SECS` (default 45), `ARCHY_MATRIX_UI_SECS` (default 30),
`ARCHY_MATRIX_ALLOW_STOPPED` (space-separated ids allowed to be non-running; the suite reads it but no test consults it yet). Verified 5/5 on .228
(17 apps) and .116 (20 apps incl. grafana/nextcloud/photoprism/gitea), 2026-06-24.
To exercise the Phase 3.2 Quadlet-backend path on a target node without
editing config.json (which would require an archipelago restart and
trigger FM3 until 3.5 ships), set the env var on `archipelago.service`:

View File

@ -0,0 +1,134 @@
#!/usr/bin/env bats
# tests/lifecycle/bats/all-apps-matrix.bats
#
# Manifest-driven, fleet-wide lifecycle health matrix. The per-app suites
# (bitcoin-knots, lnd, mempool, immich, …) cover ~8 core apps in depth; this
# covers EVERY installed app in breadth, automatically — no hardcoded list.
#
# It derives the app set from server.get-state's package-data (the My Apps map)
# and asserts baseline health across all of them. Read-only (no destructive env
# needed), so it joins run.sh / run-gate.sh on every node and grows coverage as
# nodes install more apps.
#
# Catches, fleet-wide, the bug classes the narrow gate missed:
# - apps STUCK in a transitional state (the #13/#14 ghost: installing/removing
# that never settles)
# - apps sitting in error/failed
# - running UI apps with no reachable lan-address (generalized port-drift)
# Shared helpers: rpc_login, rpc_logout_local and rpc_result are called below
# but not defined in this file (presumably provided by this library).
load '../lib/rpc.bash'
# Transitional states are legitimate momentarily but must not PERSIST. Steady:
# running/stopped/exited/created/paused/installed/not-installed.
# Anchored with ^…$ so only a whole-string match counts as transitional.
TRANSITIONAL_RE='^(installing|pulling-image|pulling|downloading|removing|uninstalling|updating|starting|stopping|restarting)$'
# States that are a failure on their own, regardless of how long they last.
BAD_RE='^(error|failed)$'
# Apps whose state is allowed to be non-running at rest (no UI/health expectation
# beyond "settled"). Empty by default; override via ARCHY_MATRIX_ALLOW_STOPPED
# (space-separated ids) on nodes where an app is intentionally left stopped.
# NOTE(review): no test in this file reads ALLOW_STOPPED — the value is captured
# here but currently has no effect on any assertion.
ALLOW_STOPPED="${ARCHY_MATRIX_ALLOW_STOPPED:-}"
# Suite-level setup (runs once before the tests in this file).
# Globals:  ARCHY_PASSWORD (read; required), ARCHY_FORCE_LOGIN (set, then removed)
# Calls:    rpc_login
setup_file() {
  # Abort immediately, with a hint, when the UI password is unset or empty.
  : "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}"

  # Export the force-login flag only for the duration of the login call.
  ARCHY_FORCE_LOGIN=1
  export ARCHY_FORCE_LOGIN
  rpc_login
  unset -v ARCHY_FORCE_LOGIN
}
# Suite-level teardown (runs once after the tests in this file): hands off to
# rpc_logout_local and returns its exit status.
teardown_file() { rpc_logout_local; }
# Print the package-data object (the My Apps map) from one server.get-state
# call, as compact single-line JSON.
# Outputs: the map on stdout; "{}" when the response has no package-data key.
#          rpc_result's stderr is discarded.
# Returns: jq's exit status (last stage of the pipeline).
get_package_data() {
  local pkg_filter='.data["package-data"] // {}'
  rpc_result server.get-state '{}' 2>/dev/null \
    | jq -c "$pkg_filter"
}
# Print the installed app ids (the keys of the package-data map), one per line.
# Returns: jq's exit status.
app_ids() {
  local keys_filter='keys[]'
  get_package_data \
    | jq -r "$keys_filter"
}
# ────────────────────────────────────────────────────────────────────
@test "matrix has apps to check (get-state returns a non-empty My Apps map)" {
  # Guard for the rest of the suite: with an empty app list every other test
  # would have nothing to iterate over.
  run app_ids
  [ "$status" -eq 0 ]
  [ -n "$output" ]

  # Report coverage on fd 3 so it shows up in the TAP stream.
  local app_count app_list
  app_count=$(wc -w <<<"$output")
  # Unquoted on purpose: collapses the newline-separated ids onto one line.
  app_list=$(echo $output)
  echo "# matrix covers ${app_count} apps: ${app_list}" >&3
}
@test "no installed app is STUCK in a transitional state (settles within window)" {
  # Fails when any installed app is still in a state matching TRANSITIONAL_RE
  # once the settle window has elapsed.
  # Tunable: ARCHY_MATRIX_SETTLE_SECS (default 45) — length of the window.
  local settle="${ARCHY_MATRIX_SETTLE_SECS:-45}"
  local deadline=$(( $(date +%s) + settle ))
  local stuck="" pd rows id st

  # Re-poll: a transitional state right now may just be a genuine in-progress op,
  # so only fail apps that are STILL transitional after the settle window.
  while :; do
    stuck=""
    pd=$(get_package_data)
    if [[ -z "$pd" ]]; then
      # No JSON at all came back from get-state. Count that as "not settled"
      # and retry, instead of looping over zero apps and passing vacuously.
      stuck="(get-state returned no package-data) "
    else
      # One jq pass emits "<id><TAB><state>" per app; avoids a jq fork per app
      # and word-splitting of the id list.
      rows=$(jq -r 'to_entries[] | "\(.key)\t\(.value.state // "unknown")"' <<<"$pd")
      while IFS=$'\t' read -r id st; do
        [[ -n "$id" ]] || continue   # here-string yields one blank line when rows is empty
        if [[ "$st" =~ $TRANSITIONAL_RE ]]; then
          stuck+="${id}=${st} "
        fi
      done <<<"$rows"
    fi

    if [[ -z "$stuck" ]]; then break; fi
    if (( $(date +%s) >= deadline )); then break; fi
    sleep 5
  done

  [[ -z "$stuck" ]] || { echo "# STUCK transitional after ${settle}s: $stuck" >&3; false; }
}
@test "no installed app is in an error/failed state" {
  # Fails when any installed app's state matches BAD_RE.
  local pd rows id st bad=""

  pd=$(get_package_data)
  # An empty response means get-state produced no JSON at all; without this
  # guard the loop below would see zero apps and the test would pass vacuously.
  [[ -n "$pd" ]] || { echo "# get-state returned no package-data" >&3; false; }

  # One jq pass emits "<id><TAB><state>" per app (state defaults to "unknown").
  rows=$(jq -r 'to_entries[] | "\(.key)\t\(.value.state // "unknown")"' <<<"$pd")
  while IFS=$'\t' read -r id st; do
    [[ -n "$id" ]] || continue   # here-string yields one blank line when rows is empty
    if [[ "$st" =~ $BAD_RE ]]; then
      bad+="${id}=${st} "
    fi
  done <<<"$rows"

  [[ -z "$bad" ]] || { echo "# error/failed apps: $bad" >&3; false; }
}
@test "every running app reports a recognized state (no empty/garbage state)" {
  # Despite the title, this checks EVERY installed app (not only running ones):
  # each state must be one of the known steady, transitional or failure values.
  # A missing state is reported as 'unknown', which is not in the list.
  local pd rows id st junk=""

  pd=$(get_package_data)
  # An empty response means get-state produced no JSON at all; without this
  # guard the loop below would see zero apps and the test would pass vacuously.
  [[ -n "$pd" ]] || { echo "# get-state returned no package-data" >&3; false; }

  # One jq pass emits "<id><TAB><state>" per app instead of one jq fork per app.
  rows=$(jq -r 'to_entries[] | "\(.key)\t\(.value.state // "unknown")"' <<<"$pd")
  while IFS=$'\t' read -r id st; do
    [[ -n "$id" ]] || continue   # here-string yields one blank line when rows is empty
    case "$st" in
      # steady
      running|stopped|exited|created|paused|installed|not-installed) : ;;
      # transitional
      installing|pulling-image|pulling|downloading|removing|uninstalling|updating|starting|stopping|restarting) : ;;
      # failure / partial
      error|failed|degraded) : ;;
      *) junk+="${id}='${st}' " ;;
    esac
  done <<<"$rows"

  [[ -z "$junk" ]] || { echo "# unrecognized state values: $junk" >&3; false; }
}
@test "every running UI app exposes a lan-address (generalized port-drift)" {
  # A running app whose manifest declares a UI interface (ui=="true") must have a
  # non-null lan-address on that interface — otherwise its UI is unreachable
  # (the immich/port-drift failure mode, asserted across ALL UI apps). Poll
  # briefly to absorb the transient null seen while a container is mid-recreate.
  # Tunable: ARCHY_MATRIX_UI_SECS (default 30) — length of the polling window.
  local deadline=$(( $(date +%s) + ${ARCHY_MATRIX_UI_SECS:-30} ))
  local missing="" pd rows entry

  # Single jq pass: for every app whose state is "running", walk the manifest
  # interfaces marked ui=="true" and emit "<id>:<iface>" when the matching
  # installed["interface-addresses"][iface]["lan-address"] is null/false/absent,
  # the empty string, or the literal string "null".
  local filter='
    to_entries[]
    | select((.value.state // "unknown") == "running")
    | .key as $id
    | .value.installed["interface-addresses"] as $addrs
    | (.value.manifest.interfaces // {})
    | to_entries[]
    | select(.value.ui == "true")
    | .key as $k
    | select(($addrs[$k]["lan-address"] // "null") | (. == "null" or . == ""))
    | "\($id):\($k)"'

  while :; do
    missing=""
    pd=$(get_package_data)
    if [[ -z "$pd" ]]; then
      # No JSON at all came back from get-state. Count that as a miss and
      # retry, instead of looping over zero apps and passing vacuously.
      missing="(get-state returned no package-data) "
    else
      rows=$(jq -r "$filter" <<<"$pd")
      while IFS= read -r entry; do
        if [[ -n "$entry" ]]; then
          missing+="${entry} "
        fi
      done <<<"$rows"
    fi

    if [[ -z "$missing" ]]; then break; fi
    if (( $(date +%s) >= deadline )); then break; fi
    sleep 3
  done

  [[ -z "$missing" ]] || { echo "# running UI apps missing lan-address: $missing" >&3; false; }
}