fix(orchestrator): order only live containers on package start/restart

package.restart resolved its container list via
ordered_containers_for_start, which injected every name from the
union startup_order list that wasn't already present — including
variant names not live on a given node (mysql-mempool,
archy-mempool-api, archy-mempool-web). The phantom mysql-mempool is
2nd in the mempool start order, so do_orchestrator_package_start hit
its unknown-app-id fallback, do_package_start failed the inspect
("no such object"), and the `?` aborted the whole start sequence —
leaving mempool-api + the frontend down until the health monitor
recovered them minutes later. That was the source of the 5× gate
flakes #73 (frontend not running in 180s) and #74 (api not queryable
in 300s); root-caused from the .228 journal
("Start failed: mysql-mempool").

Replace the inject-then-sort logic with a pure helper
order_present_containers that orders only the actually-present
containers and never adds phantom entries. startup_order remains a
union of name variants across install generations — it's now used
purely to order what's live, not to inject what isn't. +3 unit tests.

Also harden bitcoin-knots.bats "valid state" probe: poll ≤30s for a
settled state instead of a single-shot read, so a container caught
mid-reconcile (transient restarting/configured) can't flake a 20-min
iteration. A genuinely-stuck container never settles, so real
breakage is still caught.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-06-23 02:22:50 -04:00
parent 57a013bc66
commit 92d7f52dd6
2 changed files with 89 additions and 17 deletions

View File

@ -376,16 +376,31 @@ pub(super) fn startup_order(package_id: &str) -> &'static [&'static str] {
/// order for the given app. Unknown containers sort to the end. /// order for the given app. Unknown containers sort to the end.
pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result<Vec<String>> { pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result<Vec<String>> {
let containers = get_containers_for_app(package_id).await?; let containers = get_containers_for_app(package_id).await?;
Ok(order_present_containers(package_id, containers))
}
/// Order the *actually-present* containers of an app by its dependency-aware
/// startup order. Containers whose name is unknown to the order list sort to
/// the end, preserving their relative input order.
///
/// This deliberately does NOT inject order entries that aren't live
/// containers. `startup_order` is a union of container-name variants across
/// install generations (e.g. `mysql-mempool` vs `archy-mempool-db`), so any
/// single install only ever has a subset of those names. Injecting a phantom
/// name makes the start path fail on a "no such object" inspect — and because
/// `do_orchestrator_package_start` propagates the unknown-app-id fallback
/// error via `?`, every later member (the api + frontend) is then skipped,
/// leaving the stack down until the health monitor recovers it minutes later.
/// That was the source of mempool gate flakes #73 (frontend) / #74 (api).
fn order_present_containers(package_id: &str, containers: Vec<String>) -> Vec<String> {
if containers.is_empty() {
// Nothing is live under any known name. Fall back to the package id so
// a single-container app whose container matches its id still gets one
// start attempt; multi-container stacks with no live members are
// surfaced as "no containers" by the caller's emptiness check.
return vec![package_id.to_string()];
}
let order = startup_order(package_id); let order = startup_order(package_id);
if order.is_empty() && containers.is_empty() {
return Ok(vec![package_id.to_string()]);
}
let mut sorted = containers;
for required in order {
if !sorted.iter().any(|name| name == required) {
sorted.push((*required).to_string());
}
}
// If no special order is defined, fall back to mempool order for legacy // If no special order is defined, fall back to mempool order for legacy
// multi-container names that may still be returned by config lookups. // multi-container names that may still be returned by config lookups.
let effective_order: &[&str] = if order.is_empty() { let effective_order: &[&str] = if order.is_empty() {
@ -393,8 +408,14 @@ pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result<Vec
} else { } else {
order order
}; };
sorted.sort_by_key(|c| effective_order.iter().position(|o| *o == c).unwrap_or(99)); let mut sorted = containers;
Ok(sorted) sorted.sort_by_key(|c| {
effective_order
.iter()
.position(|o| *o == c)
.unwrap_or(usize::MAX)
});
sorted
} }
/// Configure Fedimint Gateway to use LND instead of LDK. /// Configure Fedimint Gateway to use LND instead of LDK.
@ -452,7 +473,48 @@ pub(super) fn configure_fedimint_lnd(
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use super::{requires_unpruned_bitcoin, startup_order}; use super::{order_present_containers, requires_unpruned_bitcoin, startup_order};
#[test]
fn order_present_containers_never_injects_phantom_stack_members() {
// The live mempool stack on a node: db + api + frontend. These are the
// only real container names; the startup_order list also contains
// variant/legacy names (mysql-mempool, archy-mempool-api, ...) that are
// NOT live here and must never appear in the result — a phantom name in
// the start list aborts the orchestrator start mid-sequence (gate
// #73/#74).
let present = vec![
"mempool".to_string(),
"mempool-api".to_string(),
"archy-mempool-db".to_string(),
];
let ordered = order_present_containers("mempool", present);
// Dependency order: db -> api -> frontend.
assert_eq!(ordered, vec!["archy-mempool-db", "mempool-api", "mempool"]);
// No phantom variants leaked in.
for phantom in ["mysql-mempool", "archy-mempool-api", "archy-mempool-web"] {
assert!(
!ordered.iter().any(|c| c == phantom),
"phantom {phantom} must not be injected"
);
}
}
#[test]
fn order_present_containers_orders_known_before_unknown() {
let present = vec!["mempool".to_string(), "some-sidecar".to_string()];
let ordered = order_present_containers("mempool", present);
// The known frontend sorts ahead of an unknown sidecar.
assert_eq!(ordered, vec!["mempool", "some-sidecar"]);
}
#[test]
fn order_present_containers_empty_falls_back_to_package_id() {
assert_eq!(
order_present_containers("mempool", vec![]),
vec!["mempool".to_string()]
);
}
#[test] #[test]
fn btcpay_start_order_includes_required_stack_members() { fn btcpay_start_order_includes_required_stack_members() {

View File

@ -36,11 +36,21 @@ teardown_file() {
} }
@test "container-list reports a valid state for bitcoin-knots" { @test "container-list reports a valid state for bitcoin-knots" {
run rpc_result container-list # Poll briefly: a container caught mid-reconcile can momentarily report a
[ "$status" -eq 0 ] # transient state ("restarting"/"configured"/"removing") or no state at all.
local state # A genuinely-stuck container never settles, so this still catches real
state=$(echo "$output" | jq -r '.[] | select(.name == "bitcoin-knots") | .state') # breakage; it only absorbs churn (e.g. another container bouncing right
[[ "$state" =~ ^(running|stopped|exited|created|paused)$ ]] # before the read-only tier runs).
local state="" deadline=$(( $(date +%s) + 30 ))
while (( $(date +%s) < deadline )); do
run rpc_result container-list
[ "$status" -eq 0 ]
state=$(echo "$output" | jq -r '.[] | select(.name == "bitcoin-knots") | .state')
[[ "$state" =~ ^(running|stopped|exited|created|paused)$ ]] && return 0
sleep 3
done
echo "bitcoin-knots never reported a settled valid state within 30s (last: '$state')" >&2
return 1
} }
@test "container-status returns a valid status object for bitcoin-knots" { @test "container-status returns a valid status object for bitcoin-knots" {