From 57a69257c4fc409def6c20c639626d1fa558c902 Mon Sep 17 00:00:00 2001 From: archipelago Date: Wed, 24 Jun 2026 05:13:53 -0400 Subject: [PATCH] test(lifecycle): add CASCADE uninstall/reinstall tier (guards #13 ghost, #14 reinstall) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 5x gate is DESTRUCTIVE-only and never exercised uninstall/reinstall — where the worst field bugs lived (#13 app ghosting in My Apps after uninstall, #14 reinstall stalling on stale state). New cascade-uninstall.bats drives the full teardown path on a throwaway app (default grafana, precondition-skips if already installed so it can't destroy real data) and asserts: - fresh install reaches running via a truthful, non-silent progression - uninstall makes the entry DISAPPEAR from server.get-state package-data (the literal My Apps map) — no ghost, no stuck uninstall stage - container + (on-node) data dir are gone - reinstall returns to running - node left as found Opt-in via ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1; not yet folded into the canonical gate. Verified 7/7 against .228. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/lifecycle/TESTING.md | 21 +++ tests/lifecycle/bats/cascade-uninstall.bats | 153 ++++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 tests/lifecycle/bats/cascade-uninstall.bats diff --git a/tests/lifecycle/TESTING.md b/tests/lifecycle/TESTING.md index 769a819c..81d39ac0 100644 --- a/tests/lifecycle/TESTING.md +++ b/tests/lifecycle/TESTING.md @@ -199,8 +199,29 @@ ARCHY_PASSWORD=password123 ARCHY_ALLOW_DESTRUCTIVE=1 tests/lifecycle/run.sh # 5× release-gate run: ARCHY_PASSWORD=password123 ARCHY_ALLOW_DESTRUCTIVE=1 ARCHY_ITERATIONS=5 \ tests/lifecycle/run-gate.sh + +# CASCADE tier (uninstall → no-ghost → reinstall) — opt-in, NOT in the canonical +# gate. Installs/uninstalls a THROWAWAY app (default grafana; skips if already +# installed). 
Run on-node to also assert data-dir removal: +ARCHY_PASSWORD=password123 ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1 \ + tests/lifecycle/run.sh cascade-uninstall ``` +### CASCADE tier — uninstall/reinstall regression guard (Workstream F) + +The 5× gate is DESTRUCTIVE-only (stop/start/restart/survive); it never exercised +uninstall/reinstall, where the worst lifecycle bugs lived. `cascade-uninstall.bats` +closes that gap and encodes the fixes for two field bugs: + +| Suite | Failure it guards | Asserts | +|---|---|---| +| `cascade-uninstall.bats` | **#13 uninstall ghost** (immich/grafana stayed in My Apps after uninstall) and **#14 reinstall stops** (stalled on stale state/data) | fresh install reaches `running` via a truthful (non-silent) progression; uninstall makes the entry **disappear from `server.get-state` package-data** (no ghost, no stuck uninstall stage) + removes the container + (on-node) the data dir; reinstall returns to `running`; node left as found | + +Throwaway-app + precondition-skip (won't touch an app that's already installed), +so it's safe on a populated node. Override the app via `ARCHY_CASCADE_APP` / +`ARCHY_CASCADE_IMAGE` / `ARCHY_CASCADE_CONFIG` / `ARCHY_CASCADE_DATA_DIR`. +Gated on `ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1`. Verified 7/7 on .228 (2026-06-24). 
+ To exercise the Phase 3.2 Quadlet-backend path on a target node without editing config.json (which would require an archipelago restart and trigger FM3 until 3.5 ships), set the env var on `archipelago.service`: diff --git a/tests/lifecycle/bats/cascade-uninstall.bats b/tests/lifecycle/bats/cascade-uninstall.bats new file mode 100644 index 00000000..2d814e78 --- /dev/null +++ b/tests/lifecycle/bats/cascade-uninstall.bats @@ -0,0 +1,153 @@ +#!/usr/bin/env bats +# tests/lifecycle/bats/cascade-uninstall.bats +# +# CASCADE-tier regression guard for the uninstall → reinstall lifecycle — the +# exact bug class the gate's DESTRUCTIVE tier never exercised: +# #13 "uninstall ghost" — app stayed in My Apps after uninstall because the +# package state entry wasn't cleared when teardown hit +# cleanup residue (returned Err before removing it). +# #14 "reinstall stops" — a reinstall stalled partway on the stale state/data +# left behind by the broken uninstall. +# +# Uses a THROWAWAY app (default grafana — not installed on prod/test nodes, no +# user data) so it can drive the FULL teardown path (no preserve_data), which is +# where #13 actually bit. Precondition-skips if the app is already installed, so +# it can NEVER destroy real data on a populated node. +# +# "No ghost" is asserted against server.get-state's package-data (literally the +# My Apps map) — the entry must disappear, not linger with a stale state / +# stuck uninstall stage. +# +# Gated on ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1. RPC-based, so it works on-node or +# against a remote ARCHY_HOST (the data-dir residue check is on-node only). 
+
+load '../lib/rpc.bash'
+
+# Throwaway app under test and how to install it. Each value can be overridden
+# through the matching ARCHY_CASCADE_* environment variable.
+CASCADE_APP="${ARCHY_CASCADE_APP:-grafana}"
+CASCADE_IMAGE="${ARCHY_CASCADE_IMAGE:-docker.io/grafana/grafana:10.2.0}"
+# The JSON default lives in its own variable on purpose. Written inline as
+# "${ARCHY_CASCADE_CONFIG:-{...}}", bash ends the expansion at the FIRST "}" of
+# the JSON and treats the final "}" as a literal suffix, so a user-supplied
+# ARCHY_CASCADE_CONFIG would gain a stray trailing "}" and stop being valid JSON.
+CASCADE_DEFAULT_CONFIG='{"ports":["3000:3000"],"volumes":["/var/lib/archipelago/grafana:/var/lib/grafana"],"env":["GF_PATHS_DATA=/var/lib/grafana","GF_USERS_ALLOW_SIGN_UP=false"]}'
+CASCADE_CONFIG="${ARCHY_CASCADE_CONFIG:-$CASCADE_DEFAULT_CONFIG}"
+CASCADE_DATA_DIR="${ARCHY_CASCADE_DATA_DIR:-/var/lib/archipelago/${CASCADE_APP}}"
+
+# Runs once before the first test: aborts unless ARCHY_PASSWORD is set, then
+# calls rpc_login with ARCHY_FORCE_LOGIN=1 exported for that call only.
+setup_file() {
+  : "${ARCHY_PASSWORD:?Set ARCHY_PASSWORD env var to the UI password}"
+  export ARCHY_FORCE_LOGIN=1
+  rpc_login
+  unset ARCHY_FORCE_LOGIN
+}
+
+# Runs once after the last test: hands the session back via rpc_logout_local.
+teardown_file() {
+  rpc_logout_local
+}
+
+# Succeeds only when ARCHY_ALLOW_CASCADE_DESTRUCTIVE is exactly "1" (unset
+# counts as "0").
+cascade_enabled() {
+  [[ "${ARCHY_ALLOW_CASCADE_DESTRUCTIVE:-0}" == "1" ]]
+}
+
+# True when CASCADE_APP has an entry in My Apps (server.get-state package-data).
+# The exit status is jq's, so a failed RPC or unparsable reply also yields
+# non-zero: a failure here is NOT proof that the entry is absent.
+app_in_my_apps() {
+  rpc_result server.get-state '{}' 2>/dev/null \
+    | jq -e --arg id "$CASCADE_APP" '.data["package-data"] | has($id)' >/dev/null 2>&1
+}
+
+# Top-level state of CASCADE_APP in My Apps, or "absent" when the entry is gone.
+# An entry that still exists but carries no state prints "unknown" rather than
+# "absent", so a lingering (ghost) entry can never be mistaken for a removed one.
+# Prints nothing when the RPC fails or the reply cannot be parsed.
+app_state() {
+  rpc_result server.get-state '{}' 2>/dev/null \
+    | jq -r --arg id "$CASCADE_APP" '
+        (.data["package-data"] // {}) as $apps
+        | if ($apps | has($id)) then ($apps[$id].state // "unknown") else "absent" end'
+}
+
+# Poll My Apps until CASCADE_APP reaches $1 (a state, or "absent").
+# Args: $1 = target state (or "absent"), $2 = timeout in seconds (default 180).
+# Polls every 3s. On timeout, reports the last state seen on stderr and
+# returns 1; returns 0 as soon as the target state is observed.
+wait_app_state() {
+  local target="$1" timeout="${2:-180}" last=""
+  local deadline=$(( $(date +%s) + timeout ))
+  while (( $(date +%s) < deadline )); do
+    # "|| true": a failed poll is just "not there yet", never a reason to abort.
+    last="$(app_state || true)"
+    [[ "$last" == "$target" ]] && return 0
+    sleep 3
+  done
+  echo "wait_app_state: $CASCADE_APP never reached '$target' (last='$last') within ${timeout}s" >&2
+  return 1
+}
+
+# Path of the marker file recording that THIS run installed CASCADE_APP. Each
+# @test runs in its own process, so the fact is carried between tests on disk.
+cascade_marker() {
+  printf '%s/cascade-installed-%s' "${BATS_FILE_TMPDIR:-${BATS_TMPDIR:-/tmp}}" "$CASCADE_APP"
+}
+
+# Succeeds only when the install test of this run created the marker. Every
+# destructive step is gated on it: without the gate, an app that was already
+# installed makes the precondition and install tests skip, after which the
+# uninstall/cleanup tests would find it in My Apps and tear down real data.
+cascade_owns_app() {
+  [[ -e "$(cascade_marker)" ]]
+}
+
+# ────────────────────────────────────────────────────────────────────
+@test "cascade gate enabled" {
+  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
+}
+
+@test "precondition: ${CASCADE_APP} is not already installed (protects real data)" {
+  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
+  if app_in_my_apps; then
+    skip "${CASCADE_APP} already installed here — refusing to uninstall (would destroy data); set ARCHY_CASCADE_APP to an uninstalled throwaway"
+  fi
+}
+
+@test "install ${CASCADE_APP} (fresh) reaches running with a truthful, non-silent progression" {
+  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
+  # Discard a stale marker (possible only when the marker path falls back to a
+  # shared temp dir) BEFORE deciding whether this run may claim the app.
+  rm -f "$(cascade_marker)"
+  app_in_my_apps && skip "already installed (precondition skip)"
+
+  # Claim ownership before issuing the install, so that a half-finished install
+  # is still cleaned up by the later uninstall steps.
+  : > "$(cascade_marker)"
+
+  run rpc_result package.install "{\"id\":\"${CASCADE_APP}\",\"dockerImage\":\"${CASCADE_IMAGE}\",\"containerConfig\":${CASCADE_CONFIG}}"
+  [ "$status" -eq 0 ]
+
+  # Progress truthfulness: must pass through a transitional install state (not a
+  # silent no-op) and land on running. A warm image cache can blow through the
+  # transitional states between polls, so a missed transitional is a warn, not a
+  # failure; reaching running is the hard assertion.
+  local saw_transitional=0 deadline=$(( $(date +%s) + 300 ))
+  while (( $(date +%s) < deadline )); do
+    case "$(app_state)" in
+      installing|pulling-image|pulling|downloading|starting|created) saw_transitional=1 ;;
+      running) break ;;
+    esac
+    sleep 2
+  done
+  [ "$(app_state)" == "running" ]
+  [ "$saw_transitional" -eq 1 ] || echo "# note: no transitional install state observed (image likely cached)" >&3
+}
+
+@test "uninstall ${CASCADE_APP} clears it from My Apps — NO ghost (#13)" {
+  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
+  cascade_owns_app || skip "${CASCADE_APP} was not installed by this run — refusing to uninstall (would destroy data)"
+  app_in_my_apps || skip "${CASCADE_APP} not installed (install step must have failed)"
+
+  run rpc_result package.uninstall "{\"id\":\"${CASCADE_APP}\"}"
+  [ "$status" -eq 0 ]
+
+  # The container must go away…
+  run wait_for_container_status "$CASCADE_APP" absent 180
+  [ "$status" -eq 0 ]
+
+  # …AND the My Apps entry must be GONE — the #13 ghost was the entry lingering
+  # with a stale state / stuck uninstall stage. Poll: removal trails teardown.
+  run wait_app_state absent 120
+  [ "$status" -eq 0 ]
+
+  # Belt-and-suspenders: the key is truly absent from package-data.
+  run app_in_my_apps
+  [ "$status" -ne 0 ]
+}
+
+@test "uninstall removed the data dir (full teardown, no residue)" {
+  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
+  # Nothing was uninstalled unless this run owned the app, so there is no
+  # teardown to verify (and a pre-existing app's data dir is expected to exist).
+  cascade_owns_app || skip "${CASCADE_APP} was not installed by this run — no teardown to verify"
+  # Needs the local filesystem — on-node runs only.
+  case "${ARCHY_HOST:-127.0.0.1}" in
+    127.0.0.1|localhost) : ;;
+    *) skip "data-dir residue check is on-node only (ARCHY_HOST=${ARCHY_HOST})" ;;
+  esac
+  [[ ! -e "$CASCADE_DATA_DIR" ]]
+}
+
+@test "reinstall ${CASCADE_APP} returns to running (#14)" {
+  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
+  cascade_owns_app || skip "${CASCADE_APP} was not installed by this run — refusing to reinstall over it"
+
+  run rpc_result package.install "{\"id\":\"${CASCADE_APP}\",\"dockerImage\":\"${CASCADE_IMAGE}\",\"containerConfig\":${CASCADE_CONFIG}}"
+  [ "$status" -eq 0 ]
+  run wait_app_state running 300
+  [ "$status" -eq 0 ]
+}
+
+@test "cleanup: uninstall ${CASCADE_APP} to leave the node as found" {
+  cascade_enabled || skip "ARCHY_ALLOW_CASCADE_DESTRUCTIVE not set"
+  # "As found" includes an app that was already installed: leave that one alone.
+  cascade_owns_app || skip "${CASCADE_APP} was not installed by this run — nothing to clean up"
+  run rpc_result package.uninstall "{\"id\":\"${CASCADE_APP}\"}"
+  [ "$status" -eq 0 ]
+  run wait_for_container_status "$CASCADE_APP" absent 180
+  [ "$status" -eq 0 ]
+  run wait_app_state absent 120
+  [ "$status" -eq 0 ]
+  # The app is gone, so this run no longer owns anything.
+  rm -f "$(cascade_marker)"
+}