test(lifecycle): tolerate slow-but-healthy heavy-app recovery under 5x churn

The 5x destructive gate on heavy nodes false-failed on transient windows during stack recovery, not real regressions: - immich.bats: lan_address port-publish probe 30s -> 90s. The postgres->redis->server (DB migrations on boot) stack can take >30s to republish :2283 after a churn-induced recreate; destructive-tier immich tests already allow 180-240s. - mempool.bats: orphan-container check now polls to steady state (<=30s) instead of a single-shot count, which caught a recreated member briefly visible alongside its replacement mid-reconcile. - run-gate.sh: settle cap 180s -> 300s and also gate on immich's :2283 when installed, so the next iteration's read-only probe doesn't race a still-recovering stack. Settle returns the instant every probe is green. A genuinely unexposed/orphaned/unhealthy app still fails these checks; they only absorb the transient recreate window under sustained churn. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-25 09:18:34 -04:00 · 2026-06-25 09:18:34 -04:00 · 41e7f500f8
commit 41e7f500f8
parent a721532f55
3 changed files with 37 additions and 9 deletions
--- a/tests/lifecycle/bats/immich.bats
+++ b/tests/lifecycle/bats/immich.bats
@ -52,7 +52,12 @@ teardown_file() {
  # health-monitor bounce during the read-only tier). A genuinely unexposed
  # immich never publishes 2283, so this still catches real port drift; it only
  # absorbs the transient null seen under churn.
-  local deadline=$(( $(date +%s) + 30 ))
+  # 90s (not 30s): the immich stack (postgres→redis→server with DB migrations on
+  # boot) can take >30s to publish its host port after a churn-induced recreate,
+  # and the destructive-tier immich tests already allow 180–240s for the same
+  # stack. A genuinely unexposed immich still never publishes 2283, so this keeps
+  # catching real port drift while tolerating slow-but-healthy boots.
+  local deadline=$(( $(date +%s) + 90 ))
  while (( $(date +%s) < deadline )); do
    run rpc_result container-list
    [ "$status" -eq 0 ]
@ -62,7 +67,7 @@ teardown_file() {
    fi
    sleep 3
  done
-  echo "immich never reported a lan_address containing 2283 within 30s" >&2
+  echo "immich never reported a lan_address containing 2283 within 90s" >&2
  return 1
 }

--- a/tests/lifecycle/bats/mempool.bats
+++ b/tests/lifecycle/bats/mempool.bats
@ -75,12 +75,24 @@ mempool_skip_if_absent() {
 }

@test "no orphan mempool-related containers beyond the known set" {
-  local total known
-  total=$(podman ps -a --format '{{.Names}}' \
-    | grep -Ec '^(mempool|archy-mempool)' || true)
-  known=$(podman ps -a --format '{{.Names}}' \
-    | grep -Ec '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' || true)
-  [ "$total" -eq "$known" ]
+  # Poll for steady state (don't single-shot): a stack restart in a prior tier
+  # briefly leaves a recreated member visible alongside its replacement, so a
+  # one-shot count can momentarily see total>known even though the reconciler
+  # converges within seconds. A genuine orphan never clears, so this still
+  # catches it — it just tolerates the transient recreate window.
+  local total known deadline=$(( $(date +%s) + 30 ))
+  while (( $(date +%s) < deadline )); do
+    total=$(podman ps -a --format '{{.Names}}' \
+      | grep -Ec '^(mempool|archy-mempool)' || true)
+    known=$(podman ps -a --format '{{.Names}}' \
+      | grep -Ec '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' || true)
+    [ "$total" -eq "$known" ] && return 0
+    sleep 3
+  done
+  echo "orphan mempool container persisted >30s (total=$total known=$known):" >&2
+  podman ps -a --format '{{.Names}}' | grep -E '^(mempool|archy-mempool)' \
+    | grep -vE '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' >&2 || true
+  return 1
 }

 # ────────────────────────────────────────────────────────────────────
--- a/tests/lifecycle/run-gate.sh
+++ b/tests/lifecycle/run-gate.sh
@ -44,7 +44,12 @@ start=$(date +%s)
 # run — just delays up to the deadline. Disable with ARCHY_SETTLE=0.
 settle_stack() {
  [[ "${ARCHY_SETTLE:-1}" == "1" && "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || return 0
-  local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-180} ))
+  # 300s (not 180s): on heavy nodes the immich stack's recovery after the prior
+  # iteration's archipelago-restart test (crash_recovery retries on a ~120s
+  # cadence) can take several minutes, and the next iteration's read-only
+  # lan_address probe false-fails if immich is still mid-boot. The settle is a
+  # cap, not a fixed wait — it returns the instant every probe is green.
+  local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-300} ))
  while (( $(date +%s) < deadline )); do
    local ok=1
    # mempool-api + frontend + bitcoin-ui = good proxies for "stack reconnected"
@ -53,6 +58,12 @@ settle_stack() {
    podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert \
      --macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \
      --rpcserver localhost:10009 getinfo >/dev/null 2>&1 || ok=0
+    # Only gate on immich where it's actually installed (heavy nodes). Its web
+    # port is the same signal test 64 checks, so settling here keeps the next
+    # iteration's read-only immich probe from racing a still-recovering stack.
+    if podman container exists immich_server 2>/dev/null; then
+      curl -fsS -m 4 -o /dev/null "http://127.0.0.1:2283/" 2>/dev/null || ok=0
+    fi
    (( ok == 1 )) && { echo "  (stack settled)"; return 0; }
    sleep 4
  done