test(lifecycle): add manifest-driven all-apps health matrix
The per-app suites cover ~8 core apps in depth; nothing covered the ~30 others
(jellyfin, vaultwarden, penpot, nextcloud, grafana, …). all-apps-matrix.bats
derives the app set from server.get-state package-data (no hardcoded list) and
asserts baseline health across EVERY installed app:
- settles to a non-transitional state within a window (the #13/#14 stuck-ghost
class, generalized fleet-wide — installing/removing that never settles)
- not in error/failed
- reports a recognized (non-garbage) state
- every running UI app (manifest ui=="true") exposes a non-null lan-address
(the immich/port-drift unreachable-UI failure, generalized to all UI apps)
Read-only, so it joins run.sh/run-gate.sh on every node and grows coverage as
nodes install more apps. Verified 5/5 on .228 (17 apps) and .116 (20 apps).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
57a69257c4
commit
0406af522c
@ -222,6 +222,21 @@ so it's safe on a populated node. Override the app via `ARCHY_CASCADE_APP` /
|
||||
`ARCHY_CASCADE_IMAGE` / `ARCHY_CASCADE_CONFIG` / `ARCHY_CASCADE_DATA_DIR`.
|
||||
Gated on `ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1`. Verified 7/7 on .228 (2026-06-24).
|
||||
|
||||
### All-apps lifecycle matrix (Workstream F)
|
||||
|
||||
The per-app suites cover ~8 core apps in depth; `all-apps-matrix.bats` covers
**every installed app in breadth, automatically** — it derives the app set from
`server.get-state` package-data (no hardcoded list) and grows coverage as nodes
install more apps. **Read-only**, so it joins `run.sh`/`run-gate.sh` on every node.
|
||||
|
||||
| Suite | Guards (fleet-wide) | Asserts (per installed app) |
|---|---|---|
| `all-apps-matrix.bats` | apps STUCK transitional (the #13/#14 ghost generalized), error/failed apps, unreachable UI apps (port-drift generalized) | settles to a non-transitional state within a window; not error/failed; recognized (non-garbage) state; every **running UI app** (manifest `ui=="true"`) exposes a non-null lan-address |
|
||||
|
||||
Tunables: `ARCHY_MATRIX_SETTLE_SECS` (45), `ARCHY_MATRIX_UI_SECS` (30),
`ARCHY_MATRIX_ALLOW_STOPPED` (space-separated ids allowed non-running). Verified 5/5 on .228
(17 apps) and .116 (20 apps incl. grafana/nextcloud/photoprism/gitea), 2026-06-24.
|
||||
|
||||
To exercise the Phase 3.2 Quadlet-backend path on a target node without
editing config.json (which would require an archipelago restart and
trigger FM3 until 3.5 ships), set the env var on `archipelago.service`:
|
||||
|
||||
134
tests/lifecycle/bats/all-apps-matrix.bats
Normal file
134
tests/lifecycle/bats/all-apps-matrix.bats
Normal file
@ -0,0 +1,134 @@
|
||||
#!/usr/bin/env bats
# tests/lifecycle/bats/all-apps-matrix.bats
#
# Manifest-driven, fleet-wide lifecycle health matrix. The per-app suites
# (bitcoin-knots, lnd, mempool, immich, …) cover ~8 core apps in depth; this
# covers EVERY installed app in breadth, automatically — no hardcoded list.
#
# It derives the app set from server.get-state's package-data (the My Apps map)
# and asserts baseline health across all of them. Read-only (no destructive env
# needed), so it joins run.sh / run-gate.sh on every node and grows coverage as
# nodes install more apps.
#
# Catches, fleet-wide, the bug classes the narrow gate missed:
#   - apps STUCK in a transitional state (the #13/#14 ghost: installing/removing
#     that never settles)
#   - apps sitting in error/failed
#   - running UI apps with no reachable lan-address (generalized port-drift)
#
# Environment:
#   ARCHY_PASSWORD              required; UI password used for the RPC login
#   ARCHY_MATRIX_SETTLE_SECS    optional; settle window in seconds (default 45)
#   ARCHY_MATRIX_UI_SECS        optional; lan-address poll window (default 30)
#   ARCHY_MATRIX_ALLOW_STOPPED  optional; space-separated app ids (see below)

load '../lib/rpc.bash'

# Transitional states are legitimate momentarily but must not PERSIST. Steady:
# running/stopped/exited/created/paused/installed/not-installed.
# Both patterns are anchored, so they only match a whole state string.
TRANSITIONAL_RE='^(installing|pulling-image|pulling|downloading|removing|uninstalling|updating|starting|stopping|restarting)$'
BAD_RE='^(error|failed)$'

# Apps whose state is allowed to be non-running at rest (no UI/health expectation
# beyond "settled"). Empty by default; override via ARCHY_MATRIX_ALLOW_STOPPED
# (space-separated ids) on nodes where an app is intentionally left stopped.
# NOTE(review): ALLOW_STOPPED is assigned here but no test visible in this file
# reads it, so the override currently has no effect — confirm whether a
# "must be running" assertion was meant to consume it.
ALLOW_STOPPED="${ARCHY_MATRIX_ALLOW_STOPPED:-}"
|
||||
|
||||
# One-time suite setup: require the UI password, then log in over RPC.
# Globals:
#   ARCHY_PASSWORD     (read)  must be set and non-empty, otherwise the shell
#                              aborts with the message below
#   ARCHY_FORCE_LOGIN  (write) exported as 1 only for the rpc_login call
#                              (presumably forces a fresh session — confirm in
#                              rpc.bash), then unset again
# Returns: the exit status of rpc_login.
setup_file() {
  : "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}"

  local rc=0
  export ARCHY_FORCE_LOGIN=1
  # Capture the status instead of letting a failure skip the cleanup, so the
  # flag never leaks into later commands when the login fails.
  rpc_login || rc=$?
  unset ARCHY_FORCE_LOGIN
  return "$rc"
}
|
||||
|
||||
# One-time suite teardown: delegate to rpc_logout_local (not defined in this
# file; presumably it discards the locally cached session — confirm in rpc.bash).
# Returns: the exit status of rpc_logout_local.
teardown_file() {
  rpc_logout_local
}
|
||||
|
||||
# Echo the package-data object (the My Apps map) from one server.get-state call.
# Outputs: compact JSON on stdout; "{}" when the response has no package-data.
# Returns: non-zero when rpc_result fails or prints nothing, otherwise jq's
#          exit status.
# Why the explicit checks: jq exits 0 and prints nothing on empty input, so a
# failed RPC used to look like "zero apps installed" and every per-app loop
# built on this snapshot passed without checking anything.
get_package_data() {
  local raw
  # stderr from the RPC helper is discarded on purpose; failure is reported
  # through the return status instead.
  raw=$(rpc_result server.get-state '{}' 2>/dev/null) || return 1
  [[ -n "$raw" ]] || return 1
  jq -c '.data["package-data"] // {}' <<<"$raw"
}
|
||||
|
||||
# Print the installed app ids — the keys of the package-data map — one per
# line (jq -r 'keys[]' emits a line per key, in jq's sorted key order).
# Returns: non-zero when get_package_data fails, otherwise jq's exit status.
app_ids() {
  local pd
  # Fetch first rather than piping, so a failed fetch is not hidden behind the
  # exit status of the last pipeline stage.
  pd=$(get_package_data) || return 1
  jq -r 'keys[]' <<<"$pd"
}
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
@test "matrix has apps to check (get-state returns a non-empty My Apps map)" {
|
||||
run app_ids
|
||||
[ "$status" -eq 0 ]
|
||||
[ -n "$output" ]
|
||||
echo "# matrix covers $(echo "$output" | wc -w) apps: $(echo $output)" >&3
|
||||
}
|
||||
|
||||
@test "no installed app is STUCK in a transitional state (settles within window)" {
|
||||
local settle="${ARCHY_MATRIX_SETTLE_SECS:-45}"
|
||||
local deadline=$(( $(date +%s) + settle ))
|
||||
local stuck=""
|
||||
# Re-poll: a transitional state right now may just be a genuine in-progress op,
|
||||
# so only fail apps that are STILL transitional after the settle window.
|
||||
while :; do
|
||||
stuck=""
|
||||
local pd; pd=$(get_package_data)
|
||||
for id in $(echo "$pd" | jq -r 'keys[]'); do
|
||||
local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"')
|
||||
[[ "$st" =~ $TRANSITIONAL_RE ]] && stuck+="${id}=${st} "
|
||||
done
|
||||
[[ -z "$stuck" ]] && break
|
||||
(( $(date +%s) >= deadline )) && break
|
||||
sleep 5
|
||||
done
|
||||
[[ -z "$stuck" ]] || { echo "# STUCK transitional after ${settle}s: $stuck" >&3; false; }
|
||||
}
|
||||
|
||||
@test "no installed app is in an error/failed state" {
|
||||
local pd; pd=$(get_package_data)
|
||||
local bad=""
|
||||
for id in $(echo "$pd" | jq -r 'keys[]'); do
|
||||
local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"')
|
||||
[[ "$st" =~ $BAD_RE ]] && bad+="${id}=${st} "
|
||||
done
|
||||
[[ -z "$bad" ]] || { echo "# error/failed apps: $bad" >&3; false; }
|
||||
}
|
||||
|
||||
@test "every running app reports a recognized state (no empty/garbage state)" {
|
||||
local pd; pd=$(get_package_data)
|
||||
local junk=""
|
||||
for id in $(echo "$pd" | jq -r 'keys[]'); do
|
||||
local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"')
|
||||
case "$st" in
|
||||
running|stopped|exited|created|paused|installed|not-installed|\
|
||||
installing|pulling-image|pulling|downloading|removing|uninstalling|updating|starting|stopping|restarting|\
|
||||
error|failed|degraded) : ;;
|
||||
*) junk+="${id}='${st}' " ;;
|
||||
esac
|
||||
done
|
||||
[[ -z "$junk" ]] || { echo "# unrecognized state values: $junk" >&3; false; }
|
||||
}
|
||||
|
||||
@test "every running UI app exposes a lan-address (generalized port-drift)" {
|
||||
# A running app whose manifest declares a UI interface (ui=="true") must have a
|
||||
# non-null lan-address on that interface — otherwise its UI is unreachable
|
||||
# (the immich/port-drift failure mode, asserted across ALL UI apps). Poll
|
||||
# briefly to absorb the transient null seen while a container is mid-recreate.
|
||||
local deadline=$(( $(date +%s) + ${ARCHY_MATRIX_UI_SECS:-30} ))
|
||||
local missing=""
|
||||
while :; do
|
||||
missing=""
|
||||
local pd; pd=$(get_package_data)
|
||||
for id in $(echo "$pd" | jq -r 'keys[]'); do
|
||||
local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"')
|
||||
[[ "$st" == "running" ]] || continue
|
||||
# interface keys whose manifest marks ui=="true"
|
||||
local ui_ifaces
|
||||
ui_ifaces=$(echo "$pd" | jq -r --arg i "$id" \
|
||||
'.[$i].manifest.interfaces // {} | to_entries[] | select(.value.ui=="true") | .key')
|
||||
for k in $ui_ifaces; do
|
||||
local addr
|
||||
addr=$(echo "$pd" | jq -r --arg i "$id" --arg k "$k" \
|
||||
'.[$i].installed["interface-addresses"][$k]["lan-address"] // "null"')
|
||||
[[ "$addr" == "null" || -z "$addr" ]] && missing+="${id}:${k} "
|
||||
done
|
||||
done
|
||||
[[ -z "$missing" ]] && break
|
||||
(( $(date +%s) >= deadline )) && break
|
||||
sleep 3
|
||||
done
|
||||
[[ -z "$missing" ]] || { echo "# running UI apps missing lan-address: $missing" >&3; false; }
|
||||
}
|
||||
Loading…
x
Reference in New Issue
Block a user