archy/tests/lifecycle/run-gate.sh
archipelago b7d9210784 test(gate): optional ARCHY_GATE_CASCADE pass — wire the cascade tier in
run-gate.sh ran only the DESTRUCTIVE tier; the cascade-uninstall suite
(uninstall→no-ghost→reinstall, the #13/#14/uninstall-hang regression
guard) existed but was never enabled by the gate. Add an opt-in single
cascade pass after the 5× loop (ARCHY_GATE_CASCADE=1, requires
ARCHY_ALLOW_DESTRUCTIVE=1), counted into the pass/fail tally. Kept out
of the 5× loop deliberately — uninstall/reinstall every iteration would
balloon runtime and re-pull images; one pass guards the class. Default
gate behavior unchanged. Validated: cascade-uninstall.bats 7/7 on .228.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-26 05:22:45 -04:00

148 lines
5.9 KiB
Bash
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# tests/lifecycle/run-gate.sh — loop the lifecycle harness N times (default 5×, the release gate).
#
# Each iteration: setup-teardown → run.sh (with the same args you'd pass
# to run.sh) → setup-teardown. Tallies pass/fail per iteration and prints a
# summary at the end. Returns non-zero if any iteration failed.
#
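# Env:
#   ARCHY_ITERATIONS       number of loop iterations (default: 5)
#   ARCHY_FAIL_FAST=1      stop on first failed iteration
#   ARCHY_GATE_CASCADE=1   after the iteration loop, run ONE cascade pass
#                          (uninstall→no-ghost→reinstall a throwaway
#                          app); requires ARCHY_ALLOW_DESTRUCTIVE=1
#   ARCHY_SETTLE=0         skip the pre-iteration stack-settle wait (the wait
#                          only ever runs when ARCHY_ALLOW_DESTRUCTIVE=1)
#   ARCHY_SETTLE_SECS      upper bound, in seconds, on that wait (default: 300)
#   plus everything run.sh / lib/rpc.bash respects
#   (ARCHY_PASSWORD, ARCHY_HOST, ARCHY_SCHEME, ARCHY_ALLOW_DESTRUCTIVE,
#   ARCHY_ALLOW_CASCADE_DESTRUCTIVE, ARCHY_ALLOW_NOAUTH)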
#
# Usage:
# tests/lifecycle/run-gate.sh # 5× full bats/ suite
# ARCHY_ITERATIONS=5 tests/lifecycle/run-gate.sh # 5× full suite
# tests/lifecycle/run-gate.sh bitcoin-knots # 5× a single suite
#
# Suggested release-gate invocation:
# ARCHY_PASSWORD=password123 ARCHY_ALLOW_DESTRUCTIVE=1 \
# tests/lifecycle/run-gate.sh
#
# Release-gate WITH the cascade tier (uninstall/reinstall regression guard):
# ARCHY_PASSWORD=password123 ARCHY_ALLOW_DESTRUCTIVE=1 ARCHY_GATE_CASCADE=1 \
# tests/lifecycle/run-gate.sh
set -euo pipefail

# Work from the directory this script lives in, so the relative ./run.sh and
# ./setup-teardown.sh invocations resolve no matter where the caller's cwd is.
HERE="$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" && pwd)"
cd "$HERE"

# Iteration count: ARCHY_ITERATIONS, falling back to 5 when unset or empty.
# Accept only a positive decimal integer with no leading zero; anything else
# is a usage error (exit 2).
ITER="${ARCHY_ITERATIONS:-5}"
case "$ITER" in
  '' | 0* | *[!0-9]*)
    echo "ARCHY_ITERATIONS must be a positive integer, got: $ITER" >&2
    exit 2
    ;;
esac

# Wall-clock start (epoch seconds) and the running tally.
# `failures` collects the label of every failed pass.
start=$(date +%s)
passed=0
failed=0
failures=()
#######################################
# Best-effort settle: wait for the backend stack to be healthy before an
# iteration starts, so back-to-back destructive iterations don't compound
# restart churn (lnd wallet-unlock + the 4-container mempool stack reconnect
# need time to recover). On-node gate only (localhost probes); never fails the
# run — just delays up to the deadline.
#
# Globals (read):
#   ARCHY_SETTLE             "1" (default) enables the wait; any other value
#                            disables it, e.g. ARCHY_SETTLE=0.
#   ARCHY_ALLOW_DESTRUCTIVE  the wait only runs when this is "1".
#   ARCHY_SETTLE_SECS        cap on the wait in seconds (default 300). A value
#                            that is not a non-negative decimal integer is
#                            reported on stderr and replaced by 300 rather
#                            than aborting the gate.
# Outputs:
#   One status line on stdout when the wait ran (settled / deadline reached).
# Returns:
#   Always 0.
#######################################
settle_stack() {
  [[ "${ARCHY_SETTLE:-1}" == "1" && "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || return 0

  # 300s (not 180s): on heavy nodes the immich stack's recovery after the prior
  # iteration's archipelago-restart test (crash_recovery retries on a ~120s
  # cadence) can take several minutes, and the next iteration's read-only
  # lan_address probe false-fails if immich is still mid-boot. The settle is a
  # cap, not a fixed wait — it returns the instant every probe is green.
  local budget="${ARCHY_SETTLE_SECS:-300}"
  if ! [[ "$budget" =~ ^[0-9]+$ ]]; then
    # Expanding a malformed value inside $(( )) would abort the script under
    # `set -eu`; this function must never fail the run, so fall back instead.
    echo " (ignoring invalid ARCHY_SETTLE_SECS='$budget', using 300)" >&2
    budget=300
  fi

  local deadline ok
  # 10# forces base 10 so a zero-padded value such as "060" is not read as octal.
  deadline=$(( $(date +%s) + 10#$budget ))

  while (( $(date +%s) < deadline )); do
    ok=1
    # Proxies for "stack reconnected": the mempool API's backend-info endpoint
    # on :8999, the web UI on :4080, and a read-only `lncli getinfo` inside the
    # lnd container. Any failing probe marks this round as not-yet-settled.
    curl -fsS -m 4 -o /dev/null "http://127.0.0.1:8999/api/v1/backend-info" 2>/dev/null || ok=0
    curl -fsS -m 4 -o /dev/null "http://127.0.0.1:4080/" 2>/dev/null || ok=0
    podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert \
      --macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon \
      --rpcserver localhost:10009 getinfo >/dev/null 2>&1 || ok=0
    # Only gate on immich where it's actually installed (heavy nodes). Its web
    # port is the same signal test 64 checks, so settling here keeps the next
    # iteration's read-only immich probe from racing a still-recovering stack.
    if podman container exists immich_server 2>/dev/null; then
      curl -fsS -m 4 -o /dev/null "http://127.0.0.1:2283/" 2>/dev/null || ok=0
    fi

    if (( ok == 1 )); then
      echo " (stack settled)"
      return 0
    fi
    sleep 4
  done

  echo " (stack settle deadline reached — proceeding anyway)"
}
# Start from a clean slate: tear down once up front so cookies left behind by
# a previous run cannot poison iteration 1.
./setup-teardown.sh

# Main gate loop: settle → run.sh (forwarding this script's own arguments) →
# tally → teardown, ITER times.
for (( i = 1; i <= ITER; i++ )); do
  echo
  echo "═══ iteration $i / $ITER ═══"
  iter_start=$(date +%s)
  settle_stack

  # Capture the harness exit code without tripping `set -e`.
  rc=0
  ./run.sh "$@" || rc=$?
  iter_end=$(date +%s)

  if (( rc != 0 )); then
    failed=$((failed + 1))
    failures+=("$i")
    echo "── iteration $i: FAIL (exit=$rc, $((iter_end - iter_start))s) ──"
    if [[ "${ARCHY_FAIL_FAST:-0}" == "1" ]]; then
      # Leaves the loop before the per-iteration teardown below.
      echo "ARCHY_FAIL_FAST=1, stopping early"
      break
    fi
  else
    passed=$((passed + 1))
    echo "── iteration $i: PASS ($((iter_end - iter_start))s) ──"
  fi

  # Reset session-cookie state so iteration N+1 does not inherit whatever
  # iteration N left behind.
  ./setup-teardown.sh
done
# Optional CASCADE pass — uninstall → no-ghost → reinstall of a throwaway app
# (default grafana, via cascade-uninstall.bats). Run ONCE, not folded into the
# 5× loop on purpose: uninstall/reinstall every iteration would balloon runtime
# and re-pull images. One pass gates the #13 ghost / #14 reinstall-stop /
# uninstall-hang class (the bug fixed in 71cc9ac4). Opt-in so default gate
# behavior is unchanged; counts into the pass/fail tally.
#
# Two situations skip the pass, each with an explicit notice so a skipped tier
# is never mistaken for a passed one:
#   * ARCHY_GATE_CASCADE=1 without ARCHY_ALLOW_DESTRUCTIVE=1 — the prerequisite
#     is missing; warn on stderr instead of skipping silently.
#   * ARCHY_FAIL_FAST=1 and the tally already holds a failure — the caller
#     asked to stop at the first failure, so do not start another
#     destructive pass.
if [[ "${ARCHY_GATE_CASCADE:-0}" == "1" ]]; then
  if [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" != "1" ]]; then
    echo "ARCHY_GATE_CASCADE=1 ignored: the cascade pass requires ARCHY_ALLOW_DESTRUCTIVE=1" >&2
  elif [[ "${ARCHY_FAIL_FAST:-0}" == "1" ]] && (( failed > 0 )); then
    echo "ARCHY_FAIL_FAST=1 and a failure was recorded, skipping CASCADE pass"
  else
    echo
    echo "═══ CASCADE pass (1×) ═══"
    settle_stack
    # The cascade opt-in is set for this one run.sh invocation only.
    if ARCHY_ALLOW_CASCADE_DESTRUCTIVE=1 ./run.sh cascade-uninstall; then
      passed=$((passed + 1))
      echo "── CASCADE: PASS ──"
    else
      failed=$((failed + 1))
      failures+=("cascade")
      echo "── CASCADE: FAIL ──"
    fi
    ./setup-teardown.sh
  fi
fi
end=$(date +%s)

# Planned number of passes: the ITER loop iterations, plus one when the opt-in
# cascade pass is enabled (same condition that gates it). The cascade pass is
# counted into passed/failed, so without this the summary would print
# e.g. "6 / 5" on a full run with ARCHY_GATE_CASCADE=1.
planned=$ITER
if [[ "${ARCHY_GATE_CASCADE:-0}" == "1" && "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]]; then
  planned=$((ITER + 1))
fi

echo
echo "════════════════════════════════════════"
echo " RESULTS"
# Passes actually run / passes planned; fewer were run if the gate stopped early.
echo " iterations: $((passed + failed)) / $planned"
echo " passed: $passed"
echo " failed: $failed"
if (( failed > 0 )); then
  # Iteration numbers (and "cascade") of every failed pass, space-separated.
  echo " failed at: ${failures[*]}"
fi
echo " wall time: $((end - start))s"
echo "════════════════════════════════════════"

# Non-zero exit if any pass failed, so callers can gate on the status.
if (( failed > 0 )); then
  exit 1
fi