archy/tests/lifecycle/bats/all-apps-matrix.bats
archipelago 0406af522c test(lifecycle): add manifest-driven all-apps health matrix
The per-app suites cover ~8 core apps in depth; nothing covered the ~30 others
(jellyfin, vaultwarden, penpot, nextcloud, grafana, …). all-apps-matrix.bats
derives the app set from server.get-state package-data (no hardcoded list) and
asserts baseline health across EVERY installed app:
  - settles to a non-transitional state within a window (the #13/#14 stuck-ghost
    class, generalized fleet-wide — installing/removing that never settles)
  - not in error/failed
  - reports a recognized (non-garbage) state
  - every running UI app (manifest ui=="true") exposes a non-null lan-address
    (the immich/port-drift unreachable-UI failure, generalized to all UI apps)

Read-only, so it joins run.sh/run-gate.sh on every node and grows coverage as
nodes install more apps. Verified 5/5 on .228 (17 apps) and .116 (20 apps).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-24 05:27:10 -04:00

135 lines
5.4 KiB
Bash

#!/usr/bin/env bats
# tests/lifecycle/bats/all-apps-matrix.bats
#
# Manifest-driven, fleet-wide lifecycle health matrix. The per-app suites
# (bitcoin-knots, lnd, mempool, immich, …) cover ~8 core apps in depth; this
# covers EVERY installed app in breadth, automatically — no hardcoded list.
#
# It derives the app set from server.get-state's package-data (the My Apps map)
# and asserts baseline health across all of them. Read-only (no destructive env
# needed), so it joins run.sh / run-gate.sh on every node and grows coverage as
# nodes install more apps.
#
# Catches, fleet-wide, the bug classes the narrow gate missed:
# - apps STUCK in a transitional state (the #13/#14 ghost: installing/removing
# that never settles)
# - apps sitting in error/failed
# - running UI apps with no reachable lan-address (generalized port-drift)
# Pull in the shared RPC helpers. The rpc_* functions used below (rpc_login,
# rpc_logout_local, rpc_result) are not defined in this file — presumably they
# come from this library; verify against ../lib/rpc.bash.
load '../lib/rpc.bash'
# Lifecycle states that are legitimate while an operation is in flight but must
# not PERSIST. The steady states, for contrast, are:
# running/stopped/exited/created/paused/installed/not-installed.
# Anchored with ^…$ so only an exact state name matches.
TRANSITIONAL_RE='^(installing|pulling-image|pulling|downloading|removing|uninstalling|updating|starting|stopping|restarting)$'
# States that count as an outright failure whenever they are observed.
BAD_RE='^(error|failed)$'
# Ids of apps that are allowed to be non-running at rest (no UI/health
# expectation beyond "settled"). Empty by default; override via
# ARCHY_MATRIX_ALLOW_STOPPED (space-separated ids) on nodes where an app is
# intentionally left stopped.
# NOTE(review): no test visible in this file reads ALLOW_STOPPED, and no test
# fails an app merely for being stopped, so this setting currently has no
# effect — either wire it into a check or drop it.
ALLOW_STOPPED="${ARCHY_MATRIX_ALLOW_STOPPED:-}"
# File-level setup: authenticate before any test runs.
# Requires: ARCHY_PASSWORD (non-empty); the shell aborts with the message below
#           when it is missing.
# Effects:  calls rpc_login with ARCHY_FORCE_LOGIN=1 exported, then removes
#           ARCHY_FORCE_LOGIN again so it does not leak into the tests.
setup_file() {
  : "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}"
  ARCHY_FORCE_LOGIN=1
  export ARCHY_FORCE_LOGIN
  rpc_login
  unset ARCHY_FORCE_LOGIN
}
# File-level teardown hook (Bats runs it once after the last test in this file).
# Delegates to rpc_logout_local, which is not defined in this file —
# presumably it discards the session created by setup_file's rpc_login; confirm
# against the rpc helper library.
teardown_file() {
rpc_logout_local
}
# Print the package-data object (the My Apps map) from server.get-state as
# compact JSON on stdout; prints {} when the response has no package-data key.
# Outputs: one line of JSON on stdout.
# Returns: non-zero when the RPC call fails or yields no output, otherwise
#          jq's exit status.
get_package_data() {
  local state
  # Capture the RPC output before parsing. In a plain `rpc | jq` pipeline the
  # exit status is jq's alone, and jq on empty input prints nothing and
  # succeeds — so a failed get-state looked like "zero apps installed" and every
  # health assertion passed vacuously.
  state=$(rpc_result server.get-state '{}' 2>/dev/null) || return
  [[ -n "$state" ]] || return 1
  jq -c '.data["package-data"] // {}' <<<"$state"
}
# Print the ids of all installed apps (the keys of the package-data map), one
# id per line.
# Returns: non-zero when get_package_data fails, otherwise jq's exit status.
app_ids() {
  local pd
  # Fetch first, then parse: piping get_package_data straight into jq would
  # report only jq's status, so a caller checking the exit code could never
  # see a failed fetch.
  pd=$(get_package_data) || return
  jq -r 'keys[]' <<<"$pd"
}
# ────────────────────────────────────────────────────────────────────
@test "matrix has apps to check (get-state returns a non-empty My Apps map)" {
  # Sanity gate for the whole matrix: the app list must be obtainable and
  # non-empty, otherwise the remaining tests would have nothing to iterate over.
  run app_ids
  [ "$status" -eq 0 ]
  [ -n "$output" ]

  # Report coverage on fd 3. The unquoted $output is deliberate: word
  # splitting folds the one-id-per-line list into a single space-joined line.
  local n_apps id_list
  n_apps=$(wc -w <<<"$output")
  id_list=$(echo $output)
  echo "# matrix covers ${n_apps} apps: ${id_list}" >&3
}
@test "no installed app is STUCK in a transitional state (settles within window)" {
  # An app that is transitional right now may simply have an operation in
  # flight, so keep re-polling and only fail apps that are STILL transitional
  # once the settle window (ARCHY_MATRIX_SETTLE_SECS, default 45s) has elapsed.
  local window="${ARCHY_MATRIX_SETTLE_SECS:-45}"
  local give_up_at=$(( $(date +%s) + window ))
  local lingering snapshot app phase

  while true; do
    # Rebuilt from scratch on every poll so apps that settled drop out.
    lingering=""
    snapshot=$(get_package_data)
    for app in $(jq -r 'keys[]' <<<"$snapshot"); do
      phase=$(jq -r --arg i "$app" '.[$i].state // "unknown"' <<<"$snapshot")
      if [[ "$phase" =~ $TRANSITIONAL_RE ]]; then
        lingering+="${app}=${phase} "
      fi
    done

    # Done when everything has settled, or when the window is used up.
    if [[ -z "$lingering" ]] || (( $(date +%s) >= give_up_at )); then
      break
    fi
    sleep 5
  done

  if [[ -n "$lingering" ]]; then
    echo "# STUCK transitional after ${window}s: $lingering" >&3
    false
  fi
}
@test "no installed app is in an error/failed state" {
  # Single snapshot: collect every app whose state matches BAD_RE and fail with
  # the full list, so one run reports all offenders rather than just the first.
  local snapshot app phase offenders=""
  snapshot=$(get_package_data)

  for app in $(jq -r 'keys[]' <<<"$snapshot"); do
    phase=$(jq -r --arg i "$app" '.[$i].state // "unknown"' <<<"$snapshot")
    if [[ "$phase" =~ $BAD_RE ]]; then
      offenders+="${app}=${phase} "
    fi
  done

  if [[ -n "$offenders" ]]; then
    echo "# error/failed apps: $offenders" >&3
    false
  fi
}
@test "every running app reports a recognized state (no empty/garbage state)" {
  # Despite the title, this inspects EVERY app in the map, not only running
  # ones. A missing/null state is read as "unknown", which is not in the
  # vocabulary below and is therefore reported, as is an empty string.
  local snapshot app phase unrecognized=""
  snapshot=$(get_package_data)

  for app in $(jq -r 'keys[]' <<<"$snapshot"); do
    phase=$(jq -r --arg i "$app" '.[$i].state // "unknown"' <<<"$snapshot")
    case "$phase" in
      # steady states
      running | stopped | exited | created | paused | installed | not-installed) ;;
      # transitional states
      installing | pulling-image | pulling | downloading | removing | uninstalling | updating | starting | stopping | restarting) ;;
      # failure / degraded states — valid vocabulary, judged by other tests
      error | failed | degraded) ;;
      *)
        unrecognized+="${app}='${phase}' "
        ;;
    esac
  done

  if [[ -n "$unrecognized" ]]; then
    echo "# unrecognized state values: $unrecognized" >&3
    false
  fi
}
@test "every running UI app exposes a lan-address (generalized port-drift)" {
  # For each running app, every interface whose manifest entry has ui=="true"
  # must carry a non-null lan-address; without one the UI is unreachable (the
  # immich/port-drift failure mode, checked across ALL UI apps). Polls for up
  # to ARCHY_MATRIX_UI_SECS (default 30s) to ride out the transient null seen
  # while a container is mid-recreate.
  local give_up_at=$(( $(date +%s) + ${ARCHY_MATRIX_UI_SECS:-30} ))
  local unreachable snapshot app phase iface_keys iface lan

  while true; do
    # Rebuilt from scratch on every poll so recovered interfaces drop out.
    unreachable=""
    snapshot=$(get_package_data)

    for app in $(jq -r 'keys[]' <<<"$snapshot"); do
      phase=$(jq -r --arg i "$app" '.[$i].state // "unknown"' <<<"$snapshot")
      # Only running apps are expected to be reachable.
      if [[ "$phase" != "running" ]]; then
        continue
      fi

      # Keys of the interfaces the manifest flags as a UI.
      iface_keys=$(jq -r --arg i "$app" \
        '.[$i].manifest.interfaces // {} | to_entries[] | select(.value.ui=="true") | .key' \
        <<<"$snapshot")

      for iface in $iface_keys; do
        lan=$(jq -r --arg i "$app" --arg k "$iface" \
          '.[$i].installed["interface-addresses"][$k]["lan-address"] // "null"' \
          <<<"$snapshot")
        if [[ -z "$lan" || "$lan" == "null" ]]; then
          unreachable+="${app}:${iface} "
        fi
      done
    done

    # Done when every UI interface has an address, or when time is up.
    if [[ -z "$unreachable" ]] || (( $(date +%s) >= give_up_at )); then
      break
    fi
    sleep 3
  done

  if [[ -n "$unreachable" ]]; then
    echo "# running UI apps missing lan-address: $unreachable" >&3
    false
  fi
}