test(gate): harden readiness for sustained 5x churn + inter-iteration settle

The 1x gate is green; the 5x failed iters 1-2 on readiness-under-churn (apps DO
recover — lnd synced, mempool just mid-restart when probed — but slower than the
windows when restarted back-to-back). Hardening:
- run-20x.sh: best-effort settle_stack() before each iteration (wait for
  mempool-api/frontend + lnd RPC healthy, 180s, on-node, never fails the run).
- required containers present/running (80/81): wait-loops (180s) not single-shot.
- mempool api/frontend (87/88): retry ~180s not single-shot.
- mempool queryable (74): 60s->180s. lnd restart-running (64): 120s->240s.
  lnd getinfo (60): 90s->240s retry.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-22 17:11:15 -04:00
parent 22b05de6d9
commit 98f4fa44a8
4 changed files with 50 additions and 14 deletions

View File

@ -53,7 +53,7 @@ teardown_file() {
# lnd's RPC readiness LAGS the container "running" state: after a (re)start the # lnd's RPC readiness LAGS the container "running" state: after a (re)start the
# wallet must auto-unlock before lncli answers, so a single-shot getinfo races # wallet must auto-unlock before lncli answers, so a single-shot getinfo races
# that window and false-fails. Retry until ready (~90s), like a health probe. # that window and false-fails. Retry until ready (~90s), like a health probe.
run sh -lc 'for i in $(seq 1 30); do run sh -lc 'for i in $(seq 1 80); do
podman exec lnd lncli \ podman exec lnd lncli \
--tlscertpath /root/.lnd/tls.cert \ --tlscertpath /root/.lnd/tls.cert \
--macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \ --macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \
@ -92,7 +92,7 @@ teardown_file() {
run rpc_result package.start '{"id":"lnd"}' run rpc_result package.start '{"id":"lnd"}'
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
run wait_for_container_status lnd running 120 run wait_for_container_status lnd running 240
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
} }
@ -102,7 +102,7 @@ teardown_file() {
run rpc_result package.restart '{"id":"lnd"}' run rpc_result package.restart '{"id":"lnd"}'
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
run wait_for_container_status lnd running 120 run wait_for_container_status lnd running 240
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
} }

View File

@ -129,7 +129,7 @@ mempool_skip_if_absent() {
mempool_skip_if_absent mempool_skip_if_absent
# mempool-api on :8999 — same probe required-stack.bats uses for parity. # mempool-api on :8999 — same probe required-stack.bats uses for parity.
local deadline=$(( $(date +%s) + 60 )) local deadline=$(( $(date +%s) + 180 ))
while (( $(date +%s) < deadline )); do while (( $(date +%s) < deadline )); do
if curl -fsS -m 5 "http://127.0.0.1:8999/api/v1/backend-info" >/dev/null 2>&1; then if curl -fsS -m 5 "http://127.0.0.1:8999/api/v1/backend-info" >/dev/null 2>&1; then
return 0 return 0

View File

@ -41,19 +41,31 @@ bitcoin_json() {
} }
@test "required containers are present" { @test "required containers are present" {
local names # Under sustained 5× churn an app may still be mid-restart when this runs;
names="$(podman_names)" # wait for the whole required set rather than single-shot.
for c in "${required_containers[@]}"; do local deadline=$(( $(date +%s) + 180 )) names missing
echo "$names" | grep -Fx "$c" >/dev/null while (( $(date +%s) < deadline )); do
names="$(podman_names)"; missing=""
for c in "${required_containers[@]}"; do
echo "$names" | grep -Fx "$c" >/dev/null || missing="$missing $c"
done
[[ -z "$missing" ]] && return 0
sleep 3
done done
fail "required containers never all present; missing:$missing"
} }
@test "required containers are running" { @test "required containers are running" {
for c in "${required_containers[@]}"; do local deadline=$(( $(date +%s) + 180 )) notrunning
run container_running "$c" while (( $(date +%s) < deadline )); do
[ "$status" -eq 0 ] notrunning=""
[ "$output" = "true" ] for c in "${required_containers[@]}"; do
[[ "$(container_running "$c" 2>/dev/null)" == "true" ]] || notrunning="$notrunning $c"
done
[[ -z "$notrunning" ]] && return 0
sleep 3
done done
fail "required containers never all running; not-running:$notrunning"
} }
@test "bitcoin-knots RPC responds" { @test "bitcoin-knots RPC responds" {
@ -113,12 +125,13 @@ PY
} }
@test "mempool api endpoint responds" { @test "mempool api endpoint responds" {
run curl -fsS "http://127.0.0.1:8999/api/v1/backend-info" # mempool-api reconnects to electrumx after a stack restart — retry ~180s.
run sh -lc 'for i in $(seq 1 60); do curl -fsS -m 5 -o /dev/null "http://127.0.0.1:8999/api/v1/backend-info" && exit 0; sleep 3; done; exit 1'
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
} }
@test "mempool frontend responds" { @test "mempool frontend responds" {
run curl -fsS "http://127.0.0.1:4080/" run sh -lc 'for i in $(seq 1 60); do curl -fsS -m 5 -o /dev/null "http://127.0.0.1:4080/" && exit 0; sleep 3; done; exit 1'
[ "$status" -eq 0 ] [ "$status" -eq 0 ]
} }

View File

@ -37,6 +37,28 @@ failed=0
failures=() failures=()
start=$(date +%s) start=$(date +%s)
# Best-effort settle: wait for the backend stack to be healthy before an
# iteration starts, so back-to-back destructive iterations don't compound
# restart churn (lnd wallet-unlock + the 4-container mempool stack reconnect
# need time to recover). On-node gate only (localhost probes); never fails the
# run — just delays up to the deadline. Disable with ARCHY_SETTLE=0.
settle_stack() {
[[ "${ARCHY_SETTLE:-1}" == "1" && "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || return 0
local deadline=$(( $(date +%s) + ${ARCHY_SETTLE_SECS:-180} ))
while (( $(date +%s) < deadline )); do
local ok=1
# mempool-api + frontend + bitcoin-ui = good proxies for "stack reconnected"
curl -fsS -m 4 -o /dev/null "http://127.0.0.1:8999/api/v1/backend-info" 2>/dev/null || ok=0
curl -fsS -m 4 -o /dev/null "http://127.0.0.1:4080/" 2>/dev/null || ok=0
podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert \
--macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \
--rpcserver localhost:10009 getinfo >/dev/null 2>&1 || ok=0
(( ok == 1 )) && { echo " (stack settled)"; return 0; }
sleep 4
done
echo " (stack settle deadline reached — proceeding anyway)"
}
# One initial teardown so a previous run's cookies don't poison iteration 1. # One initial teardown so a previous run's cookies don't poison iteration 1.
./setup-teardown.sh ./setup-teardown.sh
@ -44,6 +66,7 @@ for i in $(seq 1 "$ITER"); do
echo echo
echo "═══ iteration $i / $ITER ═══" echo "═══ iteration $i / $ITER ═══"
iter_start=$(date +%s) iter_start=$(date +%s)
settle_stack
if ./run.sh "$@"; then if ./run.sh "$@"; then
iter_end=$(date +%s) iter_end=$(date +%s)