test(lifecycle): add CASCADE uninstall/reinstall tier (guards #13 ghost, #14 reinstall)

The 5x gate is DESTRUCTIVE-only and never exercised uninstall/reinstall — where
the worst field bugs lived (#13 app ghosting in My Apps after uninstall, #14
reinstall stalling on stale state). New cascade-uninstall.bats drives the full
teardown path on a throwaway app (default grafana, precondition-skips if already
installed so it can't destroy real data) and asserts:
  - fresh install reaches running via a truthful, non-silent progression
  - uninstall makes the entry DISAPPEAR from server.get-state package-data
    (the literal My Apps map) — no ghost, no stuck uninstall stage
  - container + (on-node) data dir are gone
  - reinstall returns to running
  - node left as found

Opt-in via ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1; not yet folded into the canonical
gate. Verified 7/7 against .228.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-24 05:13:53 -04:00
parent d1cd42c821
commit 57a69257c4
2 changed files with 174 additions and 0 deletions

View File

@ -199,8 +199,29 @@ ARCHY_PASSWORD=password123 ARCHY_ALLOW_DESTRUCTIVE=1 tests/lifecycle/run.sh
# 5× release-gate run:
ARCHY_PASSWORD=password123 ARCHY_ALLOW_DESTRUCTIVE=1 ARCHY_ITERATIONS=5 \
tests/lifecycle/run-gate.sh
# CASCADE tier (uninstall → no-ghost → reinstall) — opt-in, NOT in the canonical
# gate. Installs/uninstalls a THROWAWAY app (default grafana; skips if already
# installed). Run on-node to also assert data-dir removal:
ARCHY_PASSWORD=password123 ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1 \
tests/lifecycle/run.sh cascade-uninstall
```
### CASCADE tier — uninstall/reinstall regression guard (Workstream F)
The 5× gate is DESTRUCTIVE-only (stop/start/restart/survive); it never exercised
uninstall/reinstall, where the worst lifecycle bugs lived. `cascade-uninstall.bats`
closes that gap and encodes the fixes for two field bugs:
| Suite | Failure it guards | Asserts |
|---|---|---|
| `cascade-uninstall.bats` | **#13 uninstall ghost** (immich/grafana stayed in My Apps after uninstall) and **#14 reinstall stops** (stalled on stale state/data) | fresh install reaches `running` via a truthful (non-silent) progression; uninstall makes the entry **disappear from `server.get-state` package-data** (no ghost, no stuck uninstall stage) + removes the container + (on-node) the data dir; reinstall returns to `running`; node left as found |
Throwaway-app + precondition-skip (won't touch an app that's already installed),
so it's safe on a populated node. Override the app via `ARCHY_CASCADE_APP` /
`ARCHY_CASCADE_IMAGE` / `ARCHY_CASCADE_CONFIG` / `ARCHY_CASCADE_DATA_DIR`.
Gated on `ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1`. Verified 7/7 on .228 (2026-06-24).
To exercise the Phase 3.2 Quadlet-backend path on a target node without
editing config.json (which would require an archipelago restart and
trigger FM3 until 3.5 ships), set the env var on `archipelago.service`:

View File

@ -0,0 +1,153 @@
#!/usr/bin/env bats
# tests/lifecycle/bats/cascade-uninstall.bats
#
# CASCADE-tier regression guard for the uninstall → reinstall lifecycle — the
# exact bug class the gate's DESTRUCTIVE tier never exercised:
# #13 "uninstall ghost" — app stayed in My Apps after uninstall because the
# package state entry wasn't cleared when teardown hit
# cleanup residue (returned Err before removing it).
# #14 "reinstall stops" — a reinstall stalled partway on the stale state/data
# left behind by the broken uninstall.
#
# Uses a THROWAWAY app (default grafana — not installed on prod/test nodes, no
# user data) so it can drive the FULL teardown path (no preserve_data), which is
# where #13 actually bit. Precondition-skips if the app is already installed, so
# it can NEVER destroy real data on a populated node.
#
# "No ghost" is asserted against server.get-state's package-data (literally the
# My Apps map) — the entry must disappear, not linger with a stale state /
# stuck uninstall stage.
#
# Gated on ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1. RPC-based, so it works on-node or
# against a remote ARCHY_HOST (the data-dir residue check is on-node only).
load '../lib/rpc.bash'
# Throwaway app under test and how to install it. Each value can be overridden
# through the matching ARCHY_CASCADE_* environment variable.
CASCADE_APP="${ARCHY_CASCADE_APP:-grafana}"
CASCADE_IMAGE="${ARCHY_CASCADE_IMAGE:-docker.io/grafana/grafana:10.2.0}"
# The JSON default lives in its own variable on purpose. Written inline as
# "${ARCHY_CASCADE_CONFIG:-{...}}", bash ends the expansion at the FIRST '}' and
# treats the final '}' as a literal that is appended unconditionally — so any
# user-supplied ARCHY_CASCADE_CONFIG would gain a stray trailing '}' and become
# invalid JSON.
# NOTE: the default volume path is hard-coded to grafana; override
# ARCHY_CASCADE_CONFIG together with ARCHY_CASCADE_APP when testing another app.
_cascade_default_config='{"ports":["3000:3000"],"volumes":["/var/lib/archipelago/grafana:/var/lib/grafana"],"env":["GF_PATHS_DATA=/var/lib/grafana","GF_USERS_ALLOW_SIGN_UP=false"]}'
CASCADE_CONFIG="${ARCHY_CASCADE_CONFIG:-$_cascade_default_config}"
unset _cascade_default_config
# Host-side data directory whose removal is asserted after uninstall.
CASCADE_DATA_DIR="${ARCHY_CASCADE_DATA_DIR:-/var/lib/archipelago/${CASCADE_APP}}"
# Bats file-level setup hook: authenticates against the node before the tests
# in this file run, using the helpers loaded from ../lib/rpc.bash.
# Requires: ARCHY_PASSWORD (the UI password).
setup_file() {
# Abort with this message when ARCHY_PASSWORD is unset or empty.
: "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}"
# ARCHY_FORCE_LOGIN is exported only for the duration of rpc_login and removed
# right after. NOTE(review): presumably makes rpc_login open a fresh session
# instead of reusing a cached one — confirm in lib/rpc.bash.
export ARCHY_FORCE_LOGIN=1
rpc_login
unset ARCHY_FORCE_LOGIN
}
# Bats file-level teardown hook: calls rpc_logout_local from ../lib/rpc.bash.
# NOTE(review): by its name this discards only the locally held session —
# confirm in lib/rpc.bash.
teardown_file() {
rpc_logout_local
}
# Opt-in switch for this destructive tier.
# Returns 0 only when ARCHY_ALLOW_CASCADE_DESTRUCTIVE is exactly "1";
# unset, empty, or any other value returns 1.
cascade_enabled() {
  case "${ARCHY_ALLOW_CASCADE_DESTRUCTIVE:-0}" in
    1) return 0 ;;
    *) return 1 ;;
  esac
}
# True when CASCADE_APP has an entry in My Apps (server.get-state package-data).
app_in_my_apps() {
rpc_result server.get-state '{}' 2>/dev/null \
| jq -e --arg id "$CASCADE_APP" '.data["package-data"] | has($id)' >/dev/null 2>&1
}
# Print the top-level state of CASCADE_APP as listed in My Apps
# (server.get-state package-data):
#   <state>   the entry's .state value
#   unknown   the entry EXISTS but carries no (or a null) state
#   absent    there is no entry for CASCADE_APP at all
# "absent" is reserved for a truly missing key. A lingering entry with no
# state is exactly what an uninstall ghost can look like, so it must not be
# reported as gone.
# Prints nothing when the RPC yields no output. Exit status is jq's.
app_state() {
  rpc_result server.get-state '{}' 2>/dev/null \
    | jq -r --arg id "$CASCADE_APP" '
        (.data["package-data"] // {}) as $apps
        | if ($apps | has($id))
          then ($apps[$id].state // "unknown")
          else "absent"
          end'
}
# Block until app_state reports the wanted value for CASCADE_APP.
# Arguments:
#   $1 - state to wait for (a state name, or "absent")
#   $2 - time budget in seconds (default 180)
# Polls every 3 seconds. Returns 0 as soon as the state matches. Once the
# budget is spent, writes a diagnostic (including the last observed state) to
# stderr and returns 1.
wait_app_state() {
  local want="$1"
  local budget="${2:-180}"
  local now give_up
  now="$(date +%s)"
  give_up=$(( now + budget ))
  until (( now >= give_up )); do
    if [[ "$(app_state)" == "$want" ]]; then
      return 0
    fi
    sleep 3
    now="$(date +%s)"
  done
  printf '%s\n' "wait_app_state: $CASCADE_APP never reached '$want' (last='$(app_state)') within ${budget}s" >&2
  return 1
}
# ────────────────────────────────────────────────────────────────────
# Ownership marker — the data-safety interlock for every mutating step.
#
# `skip` only ends the @test that calls it; the remaining tests in the file
# still run. A skipped precondition therefore cannot, on its own, stop the
# later uninstall/reinstall/cleanup tests from tearing down an app that was
# ALREADY on the node. To make the protection real, the precondition test
# writes a marker file only after it has positively confirmed the app is
# absent, and every test that installs or uninstalls refuses to run without it.
#
# The marker lives in BATS_FILE_TMPDIR (a temp dir bats creates per test file,
# per run), so a stale marker from an earlier run can never authorise a
# teardown. If that directory is unavailable both helpers fail closed.
CASCADE_NOT_OWNED_MSG="precondition did not clear ${CASCADE_APP} for teardown (already installed, state unreadable, or precondition not run) — refusing to touch it"

# Record that this run may install/uninstall CASCADE_APP. Non-zero when there
# is nowhere safe to keep the marker.
cascade_claim_app() {
  [[ -n "${BATS_FILE_TMPDIR:-}" ]] || return 1
  : > "${BATS_FILE_TMPDIR}/cascade-app-owned"
}

# True only when the precondition test of THIS run claimed CASCADE_APP.
cascade_owns_app() {
  [[ -n "${BATS_FILE_TMPDIR:-}" && -e "${BATS_FILE_TMPDIR}/cascade-app-owned" ]]
}

@test "cascade gate enabled" {
  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
}

@test "precondition: ${CASCADE_APP} is not already installed (protects real data)" {
  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"

  # Require POSITIVE proof of absence. A failed or unparseable get-state must
  # not be mistaken for "not installed", because that verdict unlocks the
  # destructive steps below.
  local state_json verdict
  if ! state_json="$(rpc_result server.get-state '{}' 2>/dev/null)"; then
    echo "server.get-state failed — cannot prove ${CASCADE_APP} is absent" >&2
    return 1
  fi
  verdict="$(printf '%s' "$state_json" | jq -r --arg id "$CASCADE_APP" '
    if (.data | type) != "object" then "unknown"
    elif ((.data["package-data"] // {}) | has($id)) then "present"
    else "absent"
    end' 2>/dev/null)" || verdict="unknown"

  case "$verdict" in
    absent)
      : # safe to proceed; claim below
      ;;
    present)
      skip "${CASCADE_APP} already installed here — refusing to uninstall (would destroy data); set ARCHY_CASCADE_APP to an uninstalled throwaway"
      ;;
    *)
      echo "could not read package-data from server.get-state — cannot prove ${CASCADE_APP} is absent" >&2
      return 1
      ;;
  esac

  if ! cascade_claim_app; then
    echo "BATS_FILE_TMPDIR is unavailable — cannot record ownership of ${CASCADE_APP}; a bats-core release that provides BATS_FILE_TMPDIR is required" >&2
    return 1
  fi
}

@test "install ${CASCADE_APP} (fresh) reaches running with a truthful, non-silent progression" {
  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
  cascade_owns_app || skip "$CASCADE_NOT_OWNED_MSG"

  run rpc_result package.install "{\"id\":\"${CASCADE_APP}\",\"dockerImage\":\"${CASCADE_IMAGE}\",\"containerConfig\":${CASCADE_CONFIG}}"
  [ "$status" -eq 0 ]

  # Progress truthfulness: must pass through a transitional install state (not a
  # silent no-op) and land on running. A warm image cache can blow through the
  # transitional states between polls, so a missed transitional is a warn, not a
  # failure; reaching running is the hard assertion.
  local saw_transitional=0
  local deadline=$(( $(date +%s) + 300 ))
  while (( $(date +%s) < deadline )); do
    case "$(app_state)" in
      installing|pulling-image|pulling|downloading|starting|created) saw_transitional=1 ;;
      running) break ;;
    esac
    sleep 2
  done
  [ "$(app_state)" = "running" ]
  [ "$saw_transitional" -eq 1 ] || echo "# note: no transitional install state observed (image likely cached)" >&3
}

@test "uninstall ${CASCADE_APP} clears it from My Apps — NO ghost (#13)" {
  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
  cascade_owns_app || skip "$CASCADE_NOT_OWNED_MSG"
  app_in_my_apps || skip "${CASCADE_APP} not installed (install step must have failed)"

  run rpc_result package.uninstall "{\"id\":\"${CASCADE_APP}\"}"
  [ "$status" -eq 0 ]

  # The container must go away…
  run wait_for_container_status "$CASCADE_APP" absent 180
  [ "$status" -eq 0 ]

  # …AND the My Apps entry must be GONE — the #13 ghost was the entry lingering
  # with a stale state / stuck uninstall stage. Poll: removal trails teardown.
  run wait_app_state absent 120
  [ "$status" -eq 0 ]

  # Belt-and-suspenders: the key is truly absent from package-data.
  run app_in_my_apps
  [ "$status" -ne 0 ]
}

@test "uninstall removed the data dir (full teardown, no residue)" {
  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
  # Only meaningful when this run performed the uninstall; otherwise the
  # directory (if any) belongs to a pre-existing install.
  cascade_owns_app || skip "$CASCADE_NOT_OWNED_MSG"

  # Needs the local filesystem — on-node runs only.
  case "${ARCHY_HOST:-127.0.0.1}" in
    127.0.0.1|localhost) : ;;
    *) skip "data-dir residue check is on-node only (ARCHY_HOST=${ARCHY_HOST})" ;;
  esac
  [[ ! -e "$CASCADE_DATA_DIR" ]]
}

@test "reinstall ${CASCADE_APP} returns to running (#14)" {
  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
  cascade_owns_app || skip "$CASCADE_NOT_OWNED_MSG"

  run rpc_result package.install "{\"id\":\"${CASCADE_APP}\",\"dockerImage\":\"${CASCADE_IMAGE}\",\"containerConfig\":${CASCADE_CONFIG}}"
  [ "$status" -eq 0 ]
  run wait_app_state running 300
  [ "$status" -eq 0 ]
}

@test "cleanup: uninstall ${CASCADE_APP} to leave the node as found" {
  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
  # "As found" for a node that already had the app means leaving it installed:
  # never uninstall something this run did not put there.
  cascade_owns_app || skip "$CASCADE_NOT_OWNED_MSG"

  run rpc_result package.uninstall "{\"id\":\"${CASCADE_APP}\"}"
  [ "$status" -eq 0 ]
  run wait_for_container_status "$CASCADE_APP" absent 180
  [ "$status" -eq 0 ]
  run wait_app_state absent 120
  [ "$status" -eq 0 ]
}