test(lifecycle): tolerate slow-but-healthy heavy-app recovery under 5x churn

The 5x destructive gate on heavy nodes false-failed on transient windows
during stack recovery, not real regressions:

- immich.bats: lan_address port-publish probe 30s -> 90s. The postgres->redis
  ->server (DB migrations on boot) stack can take >30s to republish :2283 after
  a churn-induced recreate; destructive-tier immich tests already allow 180-240s.
- mempool.bats: orphan-container check now polls to steady state (<=30s) instead
  of a single-shot count, which caught a recreated member briefly visible
  alongside its replacement mid-reconcile.
- run-gate.sh: settle cap 180s -> 300s and also gate on immich's :2283 when
  installed, so the next iteration's read-only probe doesn't race a
  still-recovering stack. Settle returns the instant every probe is green.

A genuinely unexposed/orphaned/unhealthy app still fails these checks; they only
absorb the transient recreate window under sustained churn.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-25 09:18:34 -04:00
parent a721532f55
commit 41e7f500f8
3 changed files with 37 additions and 9 deletions

View File

@ -52,7 +52,12 @@ teardown_file() {
# health-monitor bounce during the read-only tier). A genuinely unexposed
# immich never publishes 2283, so this still catches real port drift; it only
# absorbs the transient null seen under churn.
local deadline=$(( $(date +%s) + 30 ))
# 90s (not 30s): the immich stack (postgres→redis→server with DB migrations on
# boot) can take >30s to publish its host port after a churn-induced recreate,
# and the destructive-tier immich tests already allow 180-240s for the same
# stack. A genuinely unexposed immich still never publishes 2283, so this keeps
# catching real port drift while tolerating slow-but-healthy boots.
local deadline=$(( $(date +%s) + 90 ))
while (( $(date +%s) < deadline )); do
run rpc_result container-list
[ "$status" -eq 0 ]
@ -62,7 +67,7 @@ teardown_file() {
fi
sleep 3
done
echo "immich never reported a lan_address containing 2283 within 30s" >&2
echo "immich never reported a lan_address containing 2283 within 90s" >&2
return 1
}

View File

@ -75,12 +75,24 @@ mempool_skip_if_absent() {
}
@test "no orphan mempool-related containers beyond the known set" {
local total known
total=$(podman ps -a --format '{{.Names}}' \
| grep -Ec '^(mempool|archy-mempool)' || true)
known=$(podman ps -a --format '{{.Names}}' \
| grep -Ec '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' || true)
[ "$total" -eq "$known" ]
# Poll for steady state (don't single-shot): a stack restart in a prior tier
# briefly leaves a recreated member visible alongside its replacement, so a
# one-shot count can momentarily see total>known even though the reconciler
# converges within seconds. A genuine orphan never clears, so this still
# catches it — it just tolerates the transient recreate window.
local total known deadline=$(( $(date +%s) + 30 ))
while (( $(date +%s) < deadline )); do
total=$(podman ps -a --format '{{.Names}}' \
| grep -Ec '^(mempool|archy-mempool)' || true)
known=$(podman ps -a --format '{{.Names}}' \
| grep -Ec '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' || true)
[ "$total" -eq "$known" ] && return 0
sleep 3
done
echo "orphan mempool container persisted >30s (total=$total known=$known):" >&2
podman ps -a --format '{{.Names}}' | grep -E '^(mempool|archy-mempool)' \
| grep -vE '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' >&2 || true
return 1
}
# ────────────────────────────────────────────────────────────────────

View File

@ -44,7 +44,12 @@ start=$(date +%s)
# run — just delays up to the deadline. Disable with ARCHY_SETTLE=0.
settle_stack() {
[[ "${ARCHY_SETTLE:-1}" == "1" && "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || return 0
local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-180} ))
# 300s (not 180s): on heavy nodes the immich stack's recovery after the prior
# iteration's archipelago-restart test (crash_recovery retries on a ~120s
# cadence) can take several minutes, and the next iteration's read-only
# lan_address probe false-fails if immich is still mid-boot. The settle is a
# cap, not a fixed wait — it returns the instant every probe is green.
local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-300} ))
while (( $(date +%s) < deadline )); do
local ok=1
# mempool-api + frontend + bitcoin-ui = good proxies for "stack reconnected"
@ -53,6 +58,12 @@ settle_stack() {
podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert \
--macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \
--rpcserver localhost:10009 getinfo >/dev/null 2>&1 || ok=0
# Only gate on immich where it's actually installed (heavy nodes). Its web
# port is the same signal that test 64 checks, so settling here keeps the next
# iteration's read-only immich probe from racing a still-recovering stack.
if podman container exists immich_server 2>/dev/null; then
curl -fsS -m 4 -o /dev/null "http://127.0.0.1:2283/" 2>/dev/null || ok=0
fi
(( ok == 1 )) && { echo " (stack settled)"; return 0; }
sleep 4
done