The per-app suites cover ~8 core apps in depth; nothing covered the ~30 others
(jellyfin, vaultwarden, penpot, nextcloud, grafana, …). all-apps-matrix.bats
derives the app set from server.get-state package-data (no hardcoded list) and
asserts baseline health across EVERY installed app:
- settles to a non-transitional state within a window (the #13/#14 stuck-ghost
class, generalized fleet-wide — installing/removing that never settles)
- not in error/failed
- reports a recognized (non-garbage) state
- every running UI app (manifest ui=="true") exposes a non-null lan-address
(the immich/port-drift unreachable-UI failure, generalized to all UI apps)
Read-only, so it joins run.sh/run-gate.sh on every node and grows coverage as
nodes install more apps. Verified 5/5 on .228 (17 apps) and .116 (20 apps).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
135 lines
5.4 KiB
Bash
135 lines
5.4 KiB
Bash
#!/usr/bin/env bats
# tests/lifecycle/bats/all-apps-matrix.bats
#
# Manifest-driven, fleet-wide lifecycle health matrix. The per-app suites
# (bitcoin-knots, lnd, mempool, immich, …) cover ~8 core apps in depth; this
# covers EVERY installed app in breadth, automatically — no hardcoded list.
#
# It derives the app set from server.get-state's package-data (the My Apps map)
# and asserts baseline health across all of them. Read-only (no destructive env
# needed), so it joins run.sh / run-gate.sh on every node and grows coverage as
# nodes install more apps.
#
# Catches, fleet-wide, the bug classes the narrow gate missed:
#   - apps STUCK in a transitional state (the #13/#14 ghost: installing/removing
#     that never settles)
#   - apps sitting in error/failed
#   - running UI apps with no reachable lan-address (generalized port-drift)

# Shared RPC helper library. rpc_login, rpc_logout_local and rpc_result are
# called below but not defined in this file, so they presumably come from
# here — confirm against lib/rpc.bash.
load '../lib/rpc.bash'

# Transitional states are legitimate momentarily but must not PERSIST. Steady:
# running/stopped/exited/created/paused/installed/not-installed.
# Both patterns are anchored (^…$) so they only match a whole state value.
TRANSITIONAL_RE='^(installing|pulling-image|pulling|downloading|removing|uninstalling|updating|starting|stopping|restarting)$'
BAD_RE='^(error|failed)$'

# Apps whose state is allowed to be non-running at rest (no UI/health expectation
# beyond "settled"). Empty by default; override via ARCHY_MATRIX_ALLOW_STOPPED
# (space-separated ids) on nodes where an app is intentionally left stopped.
# NOTE(review): no test in this file reads ALLOW_STOPPED yet — confirm whether
# a consumer is planned or the knob can be dropped.
ALLOW_STOPPED="${ARCHY_MATRIX_ALLOW_STOPPED:-}"

# One-time file setup: insist on ARCHY_PASSWORD, then log in through rpc_login
# with ARCHY_FORCE_LOGIN=1 exported for that call.
setup_file() {
  # Abort with a clear message when the UI password is missing or empty.
  : "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}"

  # Presumably makes rpc_login skip any cached session — confirm in lib/rpc.bash.
  ARCHY_FORCE_LOGIN=1
  export ARCHY_FORCE_LOGIN
  rpc_login

  # Cleared again so the flag is not left set once login is done.
  unset ARCHY_FORCE_LOGIN
}

# One-time file teardown: hand off to the rpc_logout_local helper.
# NOTE(review): the name suggests it only discards the local session —
# confirm in lib/rpc.bash.
teardown_file() {
  rpc_logout_local
}

# Print the package-data object (the My Apps map) from server.get-state as
# compact JSON on stdout; prints {} when the key is absent or null.
#
# Returns non-zero and prints nothing when the RPC call fails or produces no
# output. Without that, jq would exit 0 on empty input and every caller would
# loop over zero apps, letting its health check pass vacuously.
# NOTE(review): assumes rpc_result returns 0 on success — confirm in
# lib/rpc.bash.
get_package_data() {
  local reply
  # stderr from the RPC helper is deliberately discarded.
  reply=$(rpc_result server.get-state '{}' 2>/dev/null) || return
  [[ -n "$reply" ]] || return 1
  jq -c '.data["package-data"] // {}' <<<"$reply"
}

# Print the installed app ids, one per line (the keys of the package-data
# object, in jq's sorted key order).
# Returns non-zero without output when get_package_data fails, instead of
# letting the pipeline hide that failure behind jq's exit status.
app_ids() {
  local pd
  pd=$(get_package_data) || return
  jq -r 'keys[]' <<<"$pd"
}

# ────────────────────────────────────────────────────────────────────
|
|
@test "matrix has apps to check (get-state returns a non-empty My Apps map)" {
|
|
run app_ids
|
|
[ "$status" -eq 0 ]
|
|
[ -n "$output" ]
|
|
echo "# matrix covers $(echo "$output" | wc -w) apps: $(echo $output)" >&3
|
|
}
|
|
|
|
@test "no installed app is STUCK in a transitional state (settles within window)" {
|
|
local settle="${ARCHY_MATRIX_SETTLE_SECS:-45}"
|
|
local deadline=$(( $(date +%s) + settle ))
|
|
local stuck=""
|
|
# Re-poll: a transitional state right now may just be a genuine in-progress op,
|
|
# so only fail apps that are STILL transitional after the settle window.
|
|
while :; do
|
|
stuck=""
|
|
local pd; pd=$(get_package_data)
|
|
for id in $(echo "$pd" | jq -r 'keys[]'); do
|
|
local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"')
|
|
[[ "$st" =~ $TRANSITIONAL_RE ]] && stuck+="${id}=${st} "
|
|
done
|
|
[[ -z "$stuck" ]] && break
|
|
(( $(date +%s) >= deadline )) && break
|
|
sleep 5
|
|
done
|
|
[[ -z "$stuck" ]] || { echo "# STUCK transitional after ${settle}s: $stuck" >&3; false; }
|
|
}
|
|
|
|
@test "no installed app is in an error/failed state" {
|
|
local pd; pd=$(get_package_data)
|
|
local bad=""
|
|
for id in $(echo "$pd" | jq -r 'keys[]'); do
|
|
local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"')
|
|
[[ "$st" =~ $BAD_RE ]] && bad+="${id}=${st} "
|
|
done
|
|
[[ -z "$bad" ]] || { echo "# error/failed apps: $bad" >&3; false; }
|
|
}
|
|
|
|
@test "every running app reports a recognized state (no empty/garbage state)" {
|
|
local pd; pd=$(get_package_data)
|
|
local junk=""
|
|
for id in $(echo "$pd" | jq -r 'keys[]'); do
|
|
local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"')
|
|
case "$st" in
|
|
running|stopped|exited|created|paused|installed|not-installed|\
|
|
installing|pulling-image|pulling|downloading|removing|uninstalling|updating|starting|stopping|restarting|\
|
|
error|failed|degraded) : ;;
|
|
*) junk+="${id}='${st}' " ;;
|
|
esac
|
|
done
|
|
[[ -z "$junk" ]] || { echo "# unrecognized state values: $junk" >&3; false; }
|
|
}
|
|
|
|
@test "every running UI app exposes a lan-address (generalized port-drift)" {
|
|
# A running app whose manifest declares a UI interface (ui=="true") must have a
|
|
# non-null lan-address on that interface — otherwise its UI is unreachable
|
|
# (the immich/port-drift failure mode, asserted across ALL UI apps). Poll
|
|
# briefly to absorb the transient null seen while a container is mid-recreate.
|
|
local deadline=$(( $(date +%s) + ${ARCHY_MATRIX_UI_SECS:-30} ))
|
|
local missing=""
|
|
while :; do
|
|
missing=""
|
|
local pd; pd=$(get_package_data)
|
|
for id in $(echo "$pd" | jq -r 'keys[]'); do
|
|
local st; st=$(echo "$pd" | jq -r --arg i "$id" '.[$i].state // "unknown"')
|
|
[[ "$st" == "running" ]] || continue
|
|
# interface keys whose manifest marks ui=="true"
|
|
local ui_ifaces
|
|
ui_ifaces=$(echo "$pd" | jq -r --arg i "$id" \
|
|
'.[$i].manifest.interfaces // {} | to_entries[] | select(.value.ui=="true") | .key')
|
|
for k in $ui_ifaces; do
|
|
local addr
|
|
addr=$(echo "$pd" | jq -r --arg i "$id" --arg k "$k" \
|
|
'.[$i].installed["interface-addresses"][$k]["lan-address"] // "null"')
|
|
[[ "$addr" == "null" || -z "$addr" ]] && missing+="${id}:${k} "
|
|
done
|
|
done
|
|
[[ -z "$missing" ]] && break
|
|
(( $(date +%s) >= deadline )) && break
|
|
sleep 3
|
|
done
|
|
[[ -z "$missing" ]] || { echo "# running UI apps missing lan-address: $missing" >&3; false; }
|
|
}
|