From 892ff083c4cd2eabab88f65bdd863311efc218ad Mon Sep 17 00:00:00 2001 From: archipelago Date: Mon, 22 Jun 2026 15:43:51 -0400 Subject: [PATCH] test(gate): fix the last 4 readiness/config false-fails (none are product bugs) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On a proper on-node .228 run (synced bitcoin, 4-fix binary) the lifecycle matrix is green; these 4 were test-harness issues: - lnd 'recovers after restart' (65): bump retry window 90s->240s. lnd cold-restart recovery (wallet unlock + bitcoind reconnect + graph sync) exceeds 90s on a loaded node but DOES complete (synced_to_chain:true). - bitcoin ui responds (89): retry ~120s instead of single-shot (companion nginx may have just been recreated by the companion-survives test). - probe_app_url (99 lnd proxy + all ui-coverage proxy probes): retry up to 90s for post-restart proxy/UI readiness instead of single-shot. - required endpoints after restart (94): :8081 is nginx-proxy-manager, an OPTIONAL app (not in required_containers) — only assert it when NPM is installed; and make the trailing lncli getinfo a retry. Co-Authored-By: Claude Opus 4.8 (1M context) --- tests/lifecycle/bats/lnd.bats | 6 ++++-- .../bats/required-stack-destructive.bats | 16 +++++++++++++--- tests/lifecycle/bats/required-stack.bats | 5 ++++- tests/lifecycle/lib/ui-probes.bash | 10 ++++++++++ 4 files changed, 31 insertions(+), 6 deletions(-) diff --git a/tests/lifecycle/bats/lnd.bats b/tests/lifecycle/bats/lnd.bats index ddd9d3e5..a3635d62 100644 --- a/tests/lifecycle/bats/lnd.bats +++ b/tests/lifecycle/bats/lnd.bats @@ -110,8 +110,10 @@ teardown_file() { [[ "${ARCHY_ALLOW_DESTRUCTIVE:-0}" == "1" ]] || skip "ARCHY_ALLOW_DESTRUCTIVE not set" # lnd takes longer than bitcoind to accept RPC after cold restart because - # the wallet has to be unlocked first. Give it 90s. - local deadline=$(( $(date +%s) + 90 )) + # the wallet has to be unlocked first, then it reconnects to bitcoind and + # re-syncs the graph. On a loaded node this exceeds 90s (observed ~2min on + # .228, then synced_to_chain:true). Give it 240s. + local deadline=$(( $(date +%s) + 240 )) while (( $(date +%s) < deadline )); do if sh -lc 'podman exec lnd lncli \ --tlscertpath /root/.lnd/tls.cert \ diff --git a/tests/lifecycle/bats/required-stack-destructive.bats b/tests/lifecycle/bats/required-stack-destructive.bats index 0d042c63..b2666ee4 100755 --- a/tests/lifecycle/bats/required-stack-destructive.bats +++ b/tests/lifecycle/bats/required-stack-destructive.bats @@ -74,8 +74,13 @@ restart_with_retry() { run wait_http_ok "http://127.0.0.1:8334/" 180 [ "$status" -eq 0 ] - run wait_http_ok "http://127.0.0.1:8081/" 180 - [ "$status" -eq 0 ] + # :8081 is nginx-proxy-manager — an OPTIONAL app (not in required_containers). + # Only assert it when NPM is actually installed on this node; otherwise the + # required-endpoints check false-fails on nodes that don't run NPM. + if podman ps --format '{{.Names}}' | grep -q '^nginx-proxy-manager$'; then + run wait_http_ok "http://127.0.0.1:8081/" 180 + [ "$status" -eq 0 ] + fi run wait_http_ok "http://127.0.0.1:4080/" 180 [ "$status" -eq 0 ] @@ -83,6 +88,11 @@ restart_with_retry() { run wait_http_ok "http://127.0.0.1:8999/api/v1/backend-info" 240 [ "$status" -eq 0 ] - run sh -lc 'podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert --macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon --rpcserver localhost:10009 getinfo >/dev/null' + # lnd RPC readiness lags container 'running' (wallet unlock + graph sync) — + # retry rather than single-shot. See lnd.bats. + run sh -lc 'for i in $(seq 1 60); do + podman exec lnd lncli --tlscertpath /root/.lnd/tls.cert --macaroonpath /root/.lnd/data/chain/bitcoin/mainnet/readonly.macaroon --rpcserver localhost:10009 getinfo >/dev/null 2>&1 && exit 0 + sleep 3 + done; exit 1' [ "$status" -eq 0 ] } diff --git a/tests/lifecycle/bats/required-stack.bats b/tests/lifecycle/bats/required-stack.bats index 33474ba3..8674c0c3 100644 --- a/tests/lifecycle/bats/required-stack.bats +++ b/tests/lifecycle/bats/required-stack.bats @@ -123,7 +123,10 @@ PY } @test "bitcoin ui responds" { - run curl -fsS "http://127.0.0.1:8334/" + # The companion (archy-bitcoin-ui) may have just been recreated by an earlier + # companion-survives test; its nginx takes a moment to serve. Retry ~120s + # rather than single-shot. + run sh -lc 'for i in $(seq 1 40); do curl -fsS -o /dev/null "http://127.0.0.1:8334/" && exit 0; sleep 3; done; exit 1' [ "$status" -eq 0 ] } diff --git a/tests/lifecycle/lib/ui-probes.bash b/tests/lifecycle/lib/ui-probes.bash index a480b7cf..038c6892 100644 --- a/tests/lifecycle/lib/ui-probes.bash +++ b/tests/lifecycle/lib/ui-probes.bash @@ -65,6 +65,16 @@ probe_app_url() { if ! probe_container_running "$container"; then skip "$label: backing container '$container' is not running" fi + # An app's proxy/UI takes time to serve 200 after a (re)start — the backend + # may still be unlocking/syncing (lnd) and the companion nginx reloading. + # Retry up to ~90s rather than single-shot, so a readiness race isn't a fail. + local deadline=$(( $(date +%s) + 90 )) + while (( $(date +%s) < deadline )); do + if probe_https_200 "$url" "$label"; then + return 0 + fi + sleep 3 + done run probe_https_200 "$url" "$label" [ "$status" -eq 0 ] }