From 92d7f52dd6bf9757f624a53e53afb1516be77b85 Mon Sep 17 00:00:00 2001 From: archipelago Date: Tue, 23 Jun 2026 02:22:50 -0400 Subject: [PATCH] fix(orchestrator): order only live containers on package start/restart MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit package.restart resolved its container list via ordered_containers_for_start, which injected every name from the union startup_order list that wasn't already present — including variant names not live on a given node (mysql-mempool, archy-mempool-api, archy-mempool-web). The phantom mysql-mempool is 2nd in the mempool start order, so do_orchestrator_package_start hit its unknown-app-id fallback, do_package_start failed the inspect ("no such object"), and the `?` aborted the whole start sequence — leaving mempool-api + the frontend down until the health monitor recovered them minutes later. That was the source of the 5× gate flakes #73 (frontend not running in 180s) and #74 (api not queryable in 300s); root-caused from the .228 journal ("Start failed: mysql-mempool"). Replace the inject-then-sort logic with a pure helper order_present_containers that orders only the actually-present containers and never adds phantom entries. startup_order remains a union of name variants across install generations — it's now used purely to order what's live, not to inject what isn't. +3 unit tests. Also harden bitcoin-knots.bats "valid state" probe: poll ≤30s for a settled state instead of a single-shot read, so a container caught mid-reconcile (transient restarting/configured) can't flake a 20-min iteration. A genuinely-stuck container never settles, so real breakage is still caught. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/api/rpc/package/dependencies.rs | 86 ++++++++++++++++--- tests/lifecycle/bats/bitcoin-knots.bats | 20 +++-- 2 files changed, 89 insertions(+), 17 deletions(-) diff --git a/core/archipelago/src/api/rpc/package/dependencies.rs b/core/archipelago/src/api/rpc/package/dependencies.rs index 663ae7f4..9cea4e48 100644 --- a/core/archipelago/src/api/rpc/package/dependencies.rs +++ b/core/archipelago/src/api/rpc/package/dependencies.rs @@ -376,16 +376,31 @@ pub(super) fn startup_order(package_id: &str) -> &'static [&'static str] { /// order for the given app. Unknown containers sort to the end. pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result> { let containers = get_containers_for_app(package_id).await?; + Ok(order_present_containers(package_id, containers)) +} + +/// Order the *actually-present* containers of an app by its dependency-aware +/// startup order. Containers whose name is unknown to the order list sort to +/// the end, preserving their relative input order. +/// +/// This deliberately does NOT inject order entries that aren't live +/// containers. `startup_order` is a union of container-name variants across +/// install generations (e.g. `mysql-mempool` vs `archy-mempool-db`), so any +/// single install only ever has a subset of those names. Injecting a phantom +/// name makes the start path fail on a "no such object" inspect — and because +/// `do_orchestrator_package_start` propagates the unknown-app-id fallback +/// error via `?`, every later member (the api + frontend) is then skipped, +/// leaving the stack down until the health monitor recovers it minutes later. +/// That was the source of mempool gate flakes #73 (frontend) / #74 (api). +fn order_present_containers(package_id: &str, containers: Vec) -> Vec { + if containers.is_empty() { + // Nothing is live under any known name. Fall back to the package id so + // a single-container app whose container matches its id still gets one + // start attempt; multi-container stacks with no live members are + // surfaced as "no containers" by the caller's emptiness check. + return vec![package_id.to_string()]; + } let order = startup_order(package_id); - if order.is_empty() && containers.is_empty() { - return Ok(vec![package_id.to_string()]); - } - let mut sorted = containers; - for required in order { - if !sorted.iter().any(|name| name == required) { - sorted.push((*required).to_string()); - } - } // If no special order is defined, fall back to mempool order for legacy // multi-container names that may still be returned by config lookups. let effective_order: &[&str] = if order.is_empty() { @@ -393,8 +408,14 @@ pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result api -> frontend. + assert_eq!(ordered, vec!["archy-mempool-db", "mempool-api", "mempool"]); + // No phantom variants leaked in. + for phantom in ["mysql-mempool", "archy-mempool-api", "archy-mempool-web"] { + assert!( + !ordered.iter().any(|c| c == phantom), + "phantom {phantom} must not be injected" + ); + } + } + + #[test] + fn order_present_containers_orders_known_before_unknown() { + let present = vec!["mempool".to_string(), "some-sidecar".to_string()]; + let ordered = order_present_containers("mempool", present); + // The known frontend sorts ahead of an unknown sidecar. + assert_eq!(ordered, vec!["mempool", "some-sidecar"]); + } + + #[test] + fn order_present_containers_empty_falls_back_to_package_id() { + assert_eq!( + order_present_containers("mempool", vec![]), + vec!["mempool".to_string()] + ); + } #[test] fn btcpay_start_order_includes_required_stack_members() { diff --git a/tests/lifecycle/bats/bitcoin-knots.bats b/tests/lifecycle/bats/bitcoin-knots.bats index 175a3f0e..4024e30d 100644 --- a/tests/lifecycle/bats/bitcoin-knots.bats +++ b/tests/lifecycle/bats/bitcoin-knots.bats @@ -36,11 +36,21 @@ teardown_file() { } @test "container-list reports a valid state for bitcoin-knots" { - run rpc_result container-list - [ "$status" -eq 0 ] - local state - state=$(echo "$output" | jq -r '.[] | select(.name == "bitcoin-knots") | .state') - [[ "$state" =~ ^(running|stopped|exited|created|paused)$ ]] + # Poll briefly: a container caught mid-reconcile can momentarily report a + # transient state ("restarting"/"configured"/"removing") or no state at all. + # A genuinely-stuck container never settles, so this still catches real + # breakage; it only absorbs churn (e.g. another container bouncing right + # before the read-only tier runs). + local state="" deadline=$(( $(date +%s) + 30 )) + while (( $(date +%s) < deadline )); do + run rpc_result container-list + [ "$status" -eq 0 ] + state=$(echo "$output" | jq -r '.[] | select(.name == "bitcoin-knots") | .state') + [[ "$state" =~ ^(running|stopped|exited|created|paused)$ ]] && return 0 + sleep 3 + done + echo "bitcoin-knots never reported a settled valid state within 30s (last: '$state')" >&2 + return 1 } @test "container-status returns a valid status object for bitcoin-knots" {