diff --git a/tests/lifecycle/bats/immich.bats b/tests/lifecycle/bats/immich.bats index fec60fab..fd305642 100644 --- a/tests/lifecycle/bats/immich.bats +++ b/tests/lifecycle/bats/immich.bats @@ -52,7 +52,12 @@ teardown_file() { # health-monitor bounce during the read-only tier). A genuinely unexposed # immich never publishes 2283, so this still catches real port drift; it only # absorbs the transient null seen under churn. - local deadline=$(( $(date +%s) + 30 )) + # 90s (not 30s): the immich stack (postgres→redis→server with DB migrations on + # boot) can take >30s to publish its host port after a churn-induced recreate, + # and the destructive-tier immich tests already allow 180–240s for the same + # stack. A genuinely unexposed immich still never publishes 2283, so this keeps + # catching real port drift while tolerating slow-but-healthy boots. + local deadline=$(( $(date +%s) + 90 )) while (( $(date +%s) < deadline )); do run rpc_result container-list [ "$status" -eq 0 ] @@ -62,7 +67,7 @@ teardown_file() { fi sleep 3 done - echo "immich never reported a lan_address containing 2283 within 30s" >&2 + echo "immich never reported a lan_address containing 2283 within 90s" >&2 return 1 } diff --git a/tests/lifecycle/bats/mempool.bats b/tests/lifecycle/bats/mempool.bats index a345016b..eb87c75e 100644 --- a/tests/lifecycle/bats/mempool.bats +++ b/tests/lifecycle/bats/mempool.bats @@ -75,12 +75,24 @@ mempool_skip_if_absent() { } @test "no orphan mempool-related containers beyond the known set" { - local total known - total=$(podman ps -a --format '{{.Names}}' \ - | grep -Ec '^(mempool|archy-mempool)' || true) - known=$(podman ps -a --format '{{.Names}}' \ - | grep -Ec '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' || true) - [ "$total" -eq "$known" ] + # Poll for steady state (don't single-shot): a stack restart in a prior tier + # briefly leaves a recreated member visible alongside its replacement, so a + # one-shot count can momentarily see total>known even though the reconciler + # converges within seconds. A genuine orphan never clears, so this still + # catches it — it just tolerates the transient recreate window. + local total known deadline=$(( $(date +%s) + 30 )) + while (( $(date +%s) < deadline )); do + total=$(podman ps -a --format '{{.Names}}' \ + | grep -Ec '^(mempool|archy-mempool)' || true) + known=$(podman ps -a --format '{{.Names}}' \ + | grep -Ec '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' || true) + [ "$total" -eq "$known" ] && return 0 + sleep 3 + done + echo "orphan mempool container persisted >30s (total=$total known=$known):" >&2 + podman ps -a --format '{{.Names}}' | grep -E '^(mempool|archy-mempool)' \ + | grep -vE '^(mempool|mempool-api|archy-mempool-db|archy-mempool-web)$' >&2 || true + return 1 } # ──────────────────────────────────────────────────────────────────── diff --git a/tests/lifecycle/run-gate.sh b/tests/lifecycle/run-gate.sh index 9d67bb91..fe1fad7f 100755 --- a/tests/lifecycle/run-gate.sh +++ b/tests/lifecycle/run-gate.sh @@ -44,7 +44,12 @@ start=$(date +%s) # run — just delays up to the deadline. Disable with ARCHY_SETTLE=0. settle_stack() { [[ "${ARCHY_SETTLE:-1}" == "1" && "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || return 0 - local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-180} )) + # 300s (not 180s): on heavy nodes the immich stack's recovery after the prior + # iteration's archipelago-restart test (crash_recovery retries on a ~120s + # cadence) can take several minutes, and the next iteration's read-only + # lan_address probe false-fails if immich is still mid-boot. The settle is a + # cap, not a fixed wait — it returns the instant every probe is green. + local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-300} )) while (( $(date +%s) < deadline )); do local ok=1 # mempool-api + frontend + bitcoin-ui = good proxies for "stack reconnected" @@ -53,6 +58,12 @@ settle_stack() { podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert \ --macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \ --rpcserver localhost:10009 getinfo >/dev/null 2>&1 || ok=0 + # Only gate on immich where it's actually installed (heavy nodes). Its web + # port is the same signal test 64 checks, so settling here keeps the next + # iteration's read-only immich probe from racing a still-recovering stack. + if podman container exists immich_server 2>/dev/null; then + curl -fsS -m 4 -o /dev/null "http://127.0.0.1:2283/" 2>/dev/null || ok=0 + fi (( ok == 1 )) && { echo " (stack settled)"; return 0; } sleep 4 done