test(lifecycle): tolerate slow-but-healthy heavy-app recovery under 5x churn
The 5x destructive gate on heavy nodes false-failed on transient windows during stack recovery, not real regressions:
- immich.bats: lan_address port-publish probe 30s -> 90s. The postgres->redis->server (DB migrations on boot) stack can take >30s to republish :2283 after a churn-induced recreate; destructive-tier immich tests already allow 180-240s.
- mempool.bats: orphan-container check now polls to steady state (<=30s) instead of a single-shot count, which caught a recreated member briefly visible alongside its replacement mid-reconcile.
- run-gate.sh: settle cap 180s -> 300s and also gate on immich's :2283 when installed, so the next iteration's read-only probe doesn't race a still-recovering stack. Settle returns the instant every probe is green.

A genuinely unexposed/orphaned/unhealthy app still fails these checks; they only absorb the transient recreate window under sustained churn.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
a721532f55
commit
41e7f500f8
@ -52,7 +52,12 @@ teardown_file() {
|
|||||||
# health-monitor bounce during the read-only tier). A genuinely unexposed
|
# health-monitor bounce during the read-only tier). A genuinely unexposed
|
||||||
# immich never publishes 2283, so this still catches real port drift; it only
|
# immich never publishes 2283, so this still catches real port drift; it only
|
||||||
# absorbs the transient null seen under churn.
|
# absorbs the transient null seen under churn.
|
||||||
local deadline=$(( $(date +%s) + 30 ))
|
# 90s (not 30s): the immich stack (postgres→redis→server with DB migrations on
|
||||||
|
# boot) can take >30s to publish its host port after a churn-induced recreate,
|
||||||
|
# and the destructive-tier immich tests already allow 180–240s for the same
|
||||||
|
# stack. A genuinely unexposed immich still never publishes 2283, so this keeps
|
||||||
|
# catching real port drift while tolerating slow-but-healthy boots.
|
||||||
|
local deadline=$(( $(date +%s) + 90 ))
|
||||||
while (( $(date +%s) < deadline )); do
|
while (( $(date +%s) < deadline )); do
|
||||||
run rpc_result container-list
|
run rpc_result container-list
|
||||||
[ "$status" -eq 0 ]
|
[ "$status" -eq 0 ]
|
||||||
@ -62,7 +67,7 @@ teardown_file() {
|
|||||||
fi
|
fi
|
||||||
sleep 3
|
sleep 3
|
||||||
done
|
done
|
||||||
echo "immich never reported a lan_address containing 2283 within 30s" >&2
|
echo "immich never reported a lan_address containing 2283 within 90s" >&2
|
||||||
return 1
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@ -75,12 +75,24 @@ mempool_skip_if_absent() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@test "no orphan mempool-related containers beyond the known set" {
|
@test "no orphan mempool-related containers beyond the known set" {
|
||||||
local total known
|
# Poll for steady state (don't single-shot): a stack restart in a prior tier
|
||||||
total=$(podman ps -a --format '{{.Names}}' \
|
# briefly leaves a recreated member visible alongside its replacement, so a
|
||||||
| grep -Ec '^(mempool|archy-mempool)' || true)
|
# one-shot count can momentarily see total>known even though the reconciler
|
||||||
known=$(podman ps -a --format '{{.Names}}' \
|
# converges within seconds. A genuine orphan never clears, so this still
|
||||||
| grep -Ec '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' || true)
|
# catches it — it just tolerates the transient recreate window.
|
||||||
[ "$total" -eq "$known" ]
|
local total known deadline=$(( $(date +%s) + 30 ))
|
||||||
|
while (( $(date +%s) < deadline )); do
|
||||||
|
total=$(podman ps -a --format '{{.Names}}' \
|
||||||
|
| grep -Ec '^(mempool|archy-mempool)' || true)
|
||||||
|
known=$(podman ps -a --format '{{.Names}}' \
|
||||||
|
| grep -Ec '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' || true)
|
||||||
|
[ "$total" -eq "$known" ] && return 0
|
||||||
|
sleep 3
|
||||||
|
done
|
||||||
|
echo "orphan mempool container persisted >30s (total=$total known=$known):" >&2
|
||||||
|
podman ps -a --format '{{.Names}}' | grep -E '^(mempool|archy-mempool)' \
|
||||||
|
| grep -vE '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' >&2 || true
|
||||||
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
# ────────────────────────────────────────────────────────────────────
|
# ────────────────────────────────────────────────────────────────────
|
||||||
|
|||||||
@ -44,7 +44,12 @@ start=$(date +%s)
|
|||||||
# run — just delays up to the deadline. Disable with ARCHY_SETTLE=0.
|
# run — just delays up to the deadline. Disable with ARCHY_SETTLE=0.
|
||||||
settle_stack() {
|
settle_stack() {
|
||||||
[[ "${ARCHY_SETTLE:-1}" == "1" && "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || return 0
|
[[ "${ARCHY_SETTLE:-1}" == "1" && "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || return 0
|
||||||
local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-180} ))
|
# 300s (not 180s): on heavy nodes the immich stack's recovery after the prior
|
||||||
|
# iteration's archipelago-restart test (crash_recovery retries on a ~120s
|
||||||
|
# cadence) can take several minutes, and the next iteration's read-only
|
||||||
|
# lan_address probe false-fails if immich is still mid-boot. The settle is a
|
||||||
|
# cap, not a fixed wait — it returns the instant every probe is green.
|
||||||
|
local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-300} ))
|
||||||
while (( $(date +%s) < deadline )); do
|
while (( $(date +%s) < deadline )); do
|
||||||
local ok=1
|
local ok=1
|
||||||
# mempool-api + frontend + bitcoin-ui = good proxies for "stack reconnected"
|
# mempool-api + frontend + bitcoin-ui = good proxies for "stack reconnected"
|
||||||
@ -53,6 +58,12 @@ settle_stack() {
|
|||||||
podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert \
|
podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert \
|
||||||
--macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \
|
--macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \
|
||||||
--rpcserver localhost:10009 getinfo >/dev/null 2>&1 || ok=0
|
--rpcserver localhost:10009 getinfo >/dev/null 2>&1 || ok=0
|
||||||
|
# Only gate on immich where it's actually installed (heavy nodes). Its web
|
||||||
|
# port is the same signal test 64 checks, so settling here keeps the next
|
||||||
|
# iteration's read-only immich probe from racing a still-recovering stack.
|
||||||
|
if podman container exists immich_server 2>/dev/null; then
|
||||||
|
curl -fsS -m 4 -o /dev/null "http://127.0.0.1:2283/" 2>/dev/null || ok=0
|
||||||
|
fi
|
||||||
(( ok == 1 )) && { echo " (stack settled)"; return 0; }
|
(( ok == 1 )) && { echo " (stack settled)"; return 0; }
|
||||||
sleep 4
|
sleep 4
|
||||||
done
|
done
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user