test(lifecycle): tolerate slow-but-healthy heavy-app recovery under 5x churn
The 5x destructive gate on heavy nodes false-failed on transient windows during stack recovery, not real regressions: - immich.bats: lan_address port-publish probe 30s -> 90s. The postgres->redis->server (DB migrations on boot) stack can take >30s to republish :2283 after a churn-induced recreate; destructive-tier immich tests already allow 180-240s. - mempool.bats: orphan-container check now polls to steady state (<=30s) instead of a single-shot count, which caught a recreated member briefly visible alongside its replacement mid-reconcile. - run-gate.sh: settle cap 180s -> 300s and also gate on immich's :2283 when installed, so the next iteration's read-only probe doesn't race a still-recovering stack. Settle returns the instant every probe is green. A genuinely unexposed/orphaned/unhealthy app still fails these checks; they only absorb the transient recreate window under sustained churn. Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a721532f55
commit
41e7f500f8
@ -52,7 +52,12 @@ teardown_file() {
|
||||
# health-monitor bounce during the read-only tier). A genuinely unexposed
|
||||
# immich never publishes 2283, so this still catches real port drift; it only
|
||||
# absorbs the transient null seen under churn.
|
||||
local deadline=$(( $(date +%s) + 30 ))
|
||||
# 90s (not 30s): the immich stack (postgres→redis→server with DB migrations on
|
||||
# boot) can take >30s to publish its host port after a churn-induced recreate,
|
||||
# and the destructive-tier immich tests already allow 180–240s for the same
|
||||
# stack. A genuinely unexposed immich still never publishes 2283, so this keeps
|
||||
# catching real port drift while tolerating slow-but-healthy boots.
|
||||
local deadline=$(( $(date +%s) + 90 ))
|
||||
while (( $(date +%s) < deadline )); do
|
||||
run rpc_result container-list
|
||||
[ "$status" -eq 0 ]
|
||||
@ -62,7 +67,7 @@ teardown_file() {
|
||||
fi
|
||||
sleep 3
|
||||
done
|
||||
echo "immich never reported a lan_address containing 2283 within 30s" >&2
|
||||
echo "immich never reported a lan_address containing 2283 within 90s" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
|
||||
@ -75,12 +75,24 @@ mempool_skip_if_absent() {
|
||||
}
|
||||
|
||||
@test "no orphan mempool-related containers beyond the known set" {
|
||||
local total known
|
||||
total=$(podman ps -a --format '{{.Names}}' \
|
||||
| grep -Ec '^(mempool|archy-mempool)' || true)
|
||||
known=$(podman ps -a --format '{{.Names}}' \
|
||||
| grep -Ec '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' || true)
|
||||
[ "$total" -eq "$known" ]
|
||||
# Poll for steady state (don't single-shot): a stack restart in a prior tier
|
||||
# briefly leaves a recreated member visible alongside its replacement, so a
|
||||
# one-shot count can momentarily see total>known even though the reconciler
|
||||
# converges within seconds. A genuine orphan never clears, so this still
|
||||
# catches it — it just tolerates the transient recreate window.
|
||||
local total known deadline=$(( $(date +%s) + 30 ))
|
||||
while (( $(date +%s) < deadline )); do
|
||||
total=$(podman ps -a --format '{{.Names}}' \
|
||||
| grep -Ec '^(mempool|archy-mempool)' || true)
|
||||
known=$(podman ps -a --format '{{.Names}}' \
|
||||
| grep -Ec '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' || true)
|
||||
[ "$total" -eq "$known" ] && return 0
|
||||
sleep 3
|
||||
done
|
||||
echo "orphan mempool container persisted >30s (total=$total known=$known):" >&2
|
||||
podman ps -a --format '{{.Names}}' | grep -E '^(mempool|archy-mempool)' \
|
||||
| grep -vE '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' >&2 || true
|
||||
return 1
|
||||
}
|
||||
|
||||
# ────────────────────────────────────────────────────────────────────
|
||||
|
||||
@ -44,7 +44,12 @@ start=$(date +%s)
|
||||
# run — just delays up to the deadline. Disable with ARCHY_SETTLE=0.
|
||||
settle_stack() {
|
||||
[[ "${ARCHY_SETTLE:-1}" == "1" && "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || return 0
|
||||
local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-180} ))
|
||||
# 300s (not 180s): on heavy nodes the immich stack's recovery after the prior
|
||||
# iteration's archipelago-restart test (crash_recovery retries on a ~120s
|
||||
# cadence) can take several minutes, and the next iteration's read-only
|
||||
# lan_address probe false-fails if immich is still mid-boot. The settle is a
|
||||
# cap, not a fixed wait — it returns the instant every probe is green.
|
||||
local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-300} ))
|
||||
while (( $(date +%s) < deadline )); do
|
||||
local ok=1
|
||||
# mempool-api + frontend + bitcoin-ui = good proxies for "stack reconnected"
|
||||
@ -53,6 +58,12 @@ settle_stack() {
|
||||
podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert \
|
||||
--macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \
|
||||
--rpcserver localhost:10009 getinfo >/dev/null 2>&1 || ok=0
|
||||
# Only gate on immich where it's actually installed (heavy nodes). Its web
|
||||
# port is the same signal test 64 checks, so settling here keeps the next
|
||||
# iteration's read-only immich probe from racing a still-recovering stack.
|
||||
if podman container exists immich_server 2>/dev/null; then
|
||||
curl -fsS -m 4 -o /dev/null "http://127.0.0.1:2283/" 2>/dev/null || ok=0
|
||||
fi
|
||||
(( ok == 1 )) && { echo " (stack settled)"; return 0; }
|
||||
sleep 4
|
||||
done
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user