The 5x destructive gate on heavy nodes false-failed on transient windows during stack recovery, not real regressions:

- immich.bats: lan_address port-publish probe 30s -> 90s. The postgres->redis->server (DB migrations on boot) stack can take >30s to republish :2283 after a churn-induced recreate; destructive-tier immich tests already allow 180-240s.
- mempool.bats: orphan-container check now polls to steady state (<=30s) instead of a single-shot count, which caught a recreated member briefly visible alongside its replacement mid-reconcile.
- run-gate.sh: settle cap 180s -> 300s and also gate on immich's :2283 when installed, so the next iteration's read-only probe doesn't race a still-recovering stack. Settle returns the instant every probe is green.

A genuinely unexposed/orphaned/unhealthy app still fails these checks; they only absorb the transient recreate window under sustained churn.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
120 lines
4.6 KiB
Bash
Executable File
120 lines
4.6 KiB
Bash
Executable File
#!/usr/bin/env bash
# tests/lifecycle/run-gate.sh — loop the lifecycle harness N times (default 5×, the release gate).
#
# Each iteration: setup-teardown → run.sh (with the same args you'd pass
# to run.sh) → setup-teardown. Tallies pass/fail per iteration and prints a
# summary at the end. Returns non-zero if any iteration failed.
#
# Env:
#   ARCHY_ITERATIONS   (default: 5)
#   ARCHY_FAIL_FAST=1  stop on first failed iteration
#   plus everything run.sh / lib/rpc.bash respects
#   (ARCHY_PASSWORD, ARCHY_HOST, ARCHY_SCHEME, ARCHY_ALLOW_DESTRUCTIVE,
#   ARCHY_ALLOW_CASCADE_DESTRUCTIVE, ARCHY_ALLOW_NOAUTH)
#
# Usage:
#   tests/lifecycle/run-gate.sh                       # 5× full bats/ suite
#   ARCHY_ITERATIONS=5 tests/lifecycle/run-gate.sh    # 5× full suite
#   tests/lifecycle/run-gate.sh bitcoin-knots         # 5× a single suite
#
# Suggested release-gate invocation:
#   ARCHY_PASSWORD=password123 ARCHY_ALLOW_DESTRUCTIVE=1 \
#     tests/lifecycle/run-gate.sh

# Strict mode: abort on unhandled errors, unset variables, and failures in
# any stage of a pipeline.
set -euo pipefail

# Resolve the directory this script lives in and run from there, so the
# relative ./run.sh and ./setup-teardown.sh invocations work from any cwd.
HERE="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
cd "$HERE"

# Number of gate iterations. Must be a positive integer with no leading
# zero; anything else is a usage error (exit 2).
ITER="${ARCHY_ITERATIONS:-5}"
if ! [[ "$ITER" =~ ^[1-9][0-9]*$ ]]; then
  echo "ARCHY_ITERATIONS must be a positive integer, got: $ITER" >&2
  exit 2
fi

# Tallies for the final summary.
passed=0
failed=0
failures=()        # iteration numbers that failed, in order
start=$(date +%s)  # wall-clock start of the whole gate, epoch seconds

# Best-effort settle: wait for the backend stack to be healthy before an
# iteration starts, so back-to-back destructive iterations don't compound
# restart churn (lnd wallet-unlock + the 4-container mempool stack reconnect
# need time to recover). On-node gate only (localhost probes); never fails the
# run — just delays up to the deadline. Disable with ARCHY_SETTLE=0.
#
# Env:
#   ARCHY_SETTLE             "1" (default) enables the wait; any other value skips it
#   ARCHY_ALLOW_DESTRUCTIVE  must be "1", otherwise the wait is skipped
#   ARCHY_SETTLE_SECS        cap in whole seconds (default: 300)
# Outputs:
#   One status line on stdout when the wait runs; a warning on stderr when
#   ARCHY_SETTLE_SECS is malformed.
# Returns:
#   0 on every path — it never signals failure to the caller.
settle_stack() {
  [[ "${ARCHY_SETTLE:-1}" == "1" && "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || return 0

  # 300s (not 180s): on heavy nodes the immich stack's recovery after the prior
  # iteration's archipelago-restart test (crash_recovery retries on a ~120s
  # cadence) can take several minutes, and the next iteration's read-only
  # lan_address probe false-fails if immich is still mid-boot. The settle is a
  # cap, not a fixed wait — it returns the instant every probe is green.
  local secs="${ARCHY_SETTLE_SECS:-300}"
  if ! [[ "$secs" =~ ^[0-9]+$ ]]; then
    # A malformed cap must not abort the gate: under `set -u` a bare word in
    # arithmetic context is an unbound-variable error. Warn and use the default.
    echo "ARCHY_SETTLE_SECS must be a non-negative integer, got: $secs (using 300)" >&2
    secs=300
  fi

  local deadline ok
  # 10# forces base 10 so values such as "08" are not rejected as bad octal.
  deadline=$(( $(date +%s) + 10#$secs ))

  while (( $(date +%s) < deadline )); do
    ok=1
    # mempool-api + frontend + bitcoin-ui = good proxies for "stack reconnected"
    curl -fsS -m 4 -o /dev/null "http://127.0.0.1:8999/api/v1/backend-info" 2>/dev/null || ok=0
    curl -fsS -m 4 -o /dev/null "http://127.0.0.1:4080/" 2>/dev/null || ok=0
    podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert \
      --macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \
      --rpcserver localhost:10009 getinfo >/dev/null 2>&1 || ok=0
    # Only gate on immich where it's actually installed (heavy nodes). Its web
    # port is the same signal test 64 checks, so settling here keeps the next
    # iteration's read-only immich probe from racing a still-recovering stack.
    if podman container exists immich_server 2>/dev/null; then
      curl -fsS -m 4 -o /dev/null "http://127.0.0.1:2283/" 2>/dev/null || ok=0
    fi

    if (( ok == 1 )); then
      echo " (stack settled)"
      return 0
    fi
    sleep 4
  done

  echo " (stack settle deadline reached — proceeding anyway)"
}

# One initial teardown so a previous run's cookies don't poison iteration 1.
./setup-teardown.sh

# Run the harness ITER times, forwarding this script's own arguments to
# run.sh and tallying the outcome of each iteration.
for (( i = 1; i <= ITER; i++ )); do
  echo
  echo "═══ iteration $i / $ITER ═══"
  iter_start=$(date +%s)
  settle_stack

  # Capture run.sh's status without tripping `set -e`: a failed iteration is
  # recorded in the tally, it does not abort the gate.
  rc=0
  ./run.sh "$@" || rc=$?
  iter_end=$(date +%s)

  if (( rc == 0 )); then
    passed=$((passed + 1))
    echo "── iteration $i: PASS ($((iter_end - iter_start))s) ──"
  else
    failed=$((failed + 1))
    failures+=("$i")
    echo "── iteration $i: FAIL (exit=$rc, $((iter_end - iter_start))s) ──"
    if [[ "${ARCHY_FAIL_FAST:-0}" == "1" ]]; then
      echo "ARCHY_FAIL_FAST=1, stopping early"
      # Leaves the loop before the per-iteration teardown below runs.
      break
    fi
  fi

  # Teardown between iterations so iteration N+1 starts with a clean
  # session-cookie state regardless of what iteration N did.
  ./setup-teardown.sh
done

# Print the end-of-run report to stdout.
# Globals (read): passed, failed, failures, ITER, start, end
# The "failed at" line is emitted only when at least one iteration failed.
print_summary() {
  echo
  echo "════════════════════════════════════════"
  echo " RESULTS"
  echo " iterations: $((passed + failed)) / $ITER"
  echo " passed: $passed"
  echo " failed: $failed"
  if (( failed > 0 )); then
    echo " failed at: ${failures[*]}"
  fi
  echo " wall time: $((end - start))s"
  echo "════════════════════════════════════════"
}

end=$(date +%s)
print_summary

# Non-zero exit if any iteration failed, so the gate can block a release.
if (( failed > 0 )); then
  exit 1
fi