diff --git a/core/archipelago/src/api/rpc/package/dependencies.rs b/core/archipelago/src/api/rpc/package/dependencies.rs index 663ae7f4..9cea4e48 100644 --- a/core/archipelago/src/api/rpc/package/dependencies.rs +++ b/core/archipelago/src/api/rpc/package/dependencies.rs @@ -376,16 +376,31 @@ pub(super) fn startup_order(package_id: &str) -> &'static [&'static str] { /// order for the given app. Unknown containers sort to the end. pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result> { let containers = get_containers_for_app(package_id).await?; + Ok(order_present_containers(package_id, containers)) +} + +/// Order the *actually-present* containers of an app by its dependency-aware +/// startup order. Containers whose name is unknown to the order list sort to +/// the end, preserving their relative input order. +/// +/// This deliberately does NOT inject order entries that aren't live +/// containers. `startup_order` is a union of container-name variants across +/// install generations (e.g. `mysql-mempool` vs `archy-mempool-db`), so any +/// single install only ever has a subset of those names. Injecting a phantom +/// name makes the start path fail on a "no such object" inspect — and because +/// `do_orchestrator_package_start` propagates the unknown-app-id fallback +/// error via `?`, every later member (the api + frontend) is then skipped, +/// leaving the stack down until the health monitor recovers it minutes later. +/// That was the source of mempool gate flakes #73 (frontend) / #74 (api). +fn order_present_containers(package_id: &str, containers: Vec) -> Vec { + if containers.is_empty() { + // Nothing is live under any known name. Fall back to the package id so + // a single-container app whose container matches its id still gets one + // start attempt; multi-container stacks with no live members are + // surfaced as "no containers" by the caller's emptiness check. + return vec![package_id.to_string()]; + } let order = startup_order(package_id); - if order.is_empty() && containers.is_empty() { - return Ok(vec![package_id.to_string()]); - } - let mut sorted = containers; - for required in order { - if !sorted.iter().any(|name| name == required) { - sorted.push((*required).to_string()); - } - } // If no special order is defined, fall back to mempool order for legacy // multi-container names that may still be returned by config lookups. let effective_order: &[&str] = if order.is_empty() { @@ -393,8 +408,14 @@ pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result api -> frontend. + assert_eq!(ordered, vec!["archy-mempool-db", "mempool-api", "mempool"]); + // No phantom variants leaked in. + for phantom in ["mysql-mempool", "archy-mempool-api", "archy-mempool-web"] { + assert!( + !ordered.iter().any(|c| c == phantom), + "phantom {phantom} must not be injected" + ); + } + } + + #[test] + fn order_present_containers_orders_known_before_unknown() { + let present = vec!["mempool".to_string(), "some-sidecar".to_string()]; + let ordered = order_present_containers("mempool", present); + // The known frontend sorts ahead of an unknown sidecar. + assert_eq!(ordered, vec!["mempool", "some-sidecar"]); + } + + #[test] + fn order_present_containers_empty_falls_back_to_package_id() { + assert_eq!( + order_present_containers("mempool", vec![]), + vec!["mempool".to_string()] + ); + } #[test] fn btcpay_start_order_includes_required_stack_members() { diff --git a/tests/lifecycle/bats/bitcoin-knots.bats b/tests/lifecycle/bats/bitcoin-knots.bats index 175a3f0e..4024e30d 100644 --- a/tests/lifecycle/bats/bitcoin-knots.bats +++ b/tests/lifecycle/bats/bitcoin-knots.bats @@ -36,11 +36,21 @@ teardown_file() { } @test "container-list reports a valid state for bitcoin-knots" { - run rpc_result container-list - [ "$status" -eq 0 ] - local state - state=$(echo "$output" | jq -r '.[] | select(.name == "bitcoin-knots") | .state') - [[ "$state" =~ ^(running|stopped|exited|created|paused)$ ]] + # Poll briefly: a container caught mid-reconcile can momentarily report a + # transient state ("restarting"/"configured"/"removing") or no state at all. + # A genuinely-stuck container never settles, so this still catches real + # breakage; it only absorbs churn (e.g. another container bouncing right + # before the read-only tier runs). + local state="" deadline=$(( $(date +%s) + 30 )) + while (( $(date +%s) < deadline )); do + run rpc_result container-list + [ "$status" -eq 0 ] + state=$(echo "$output" | jq -r '.[] | select(.name == "bitcoin-knots") | .state') + [[ "$state" =~ ^(running|stopped|exited|created|paused)$ ]] && return 0 + sleep 3 + done + echo "bitcoin-knots never reported a settled valid state within 30s (last: '$state')" >&2 + return 1 } @test "container-status returns a valid status object for bitcoin-knots" {