fix(orchestrator): order only live containers on package start/restart
package.restart resolved its container list via
ordered_containers_for_start, which injected every name from the
union startup_order list that wasn't already present — including
variant names not live on a given node (mysql-mempool,
archy-mempool-api, archy-mempool-web). The phantom mysql-mempool is
2nd in the mempool start order, so do_orchestrator_package_start hit
its unknown-app-id fallback, do_package_start failed the inspect
("no such object"), and the `?` aborted the whole start sequence —
leaving mempool-api + the frontend down until the health monitor
recovered them minutes later. That was the source of the 5× gate
flakes #73 (frontend not running in 180s) and #74 (api not queryable
in 300s); root-caused from the .228 journal
("Start failed: mysql-mempool").
Replace the inject-then-sort logic with a pure helper
order_present_containers that orders only the actually-present
containers and never adds phantom entries. startup_order remains a
union of name variants across install generations — it's now used
purely to order what's live, not to inject what isn't. +3 unit tests.
Also harden bitcoin-knots.bats "valid state" probe: poll ≤30s for a
settled state instead of a single-shot read, so a container caught
mid-reconcile (transient restarting/configured) can't flake a 20-min
iteration. A genuinely-stuck container never settles, so real
breakage is still caught.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
57a013bc66
commit
92d7f52dd6
@ -376,16 +376,31 @@ pub(super) fn startup_order(package_id: &str) -> &'static [&'static str] {
|
||||
/// order for the given app. Unknown containers sort to the end.
|
||||
pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result<Vec<String>> {
|
||||
let containers = get_containers_for_app(package_id).await?;
|
||||
Ok(order_present_containers(package_id, containers))
|
||||
}
|
||||
|
||||
/// Order the *actually-present* containers of an app by its dependency-aware
|
||||
/// startup order. Containers whose name is unknown to the order list sort to
|
||||
/// the end, preserving their relative input order.
|
||||
///
|
||||
/// This deliberately does NOT inject order entries that aren't live
|
||||
/// containers. `startup_order` is a union of container-name variants across
|
||||
/// install generations (e.g. `mysql-mempool` vs `archy-mempool-db`), so any
|
||||
/// single install only ever has a subset of those names. Injecting a phantom
|
||||
/// name makes the start path fail on a "no such object" inspect — and because
|
||||
/// `do_orchestrator_package_start` propagates the unknown-app-id fallback
|
||||
/// error via `?`, every later member (the api + frontend) is then skipped,
|
||||
/// leaving the stack down until the health monitor recovers it minutes later.
|
||||
/// That was the source of mempool gate flakes #73 (frontend) / #74 (api).
|
||||
fn order_present_containers(package_id: &str, containers: Vec<String>) -> Vec<String> {
|
||||
if containers.is_empty() {
|
||||
// Nothing is live under any known name. Fall back to the package id so
|
||||
// a single-container app whose container matches its id still gets one
|
||||
// start attempt; multi-container stacks with no live members are
|
||||
// surfaced as "no containers" by the caller's emptiness check.
|
||||
return vec![package_id.to_string()];
|
||||
}
|
||||
let order = startup_order(package_id);
|
||||
if order.is_empty() && containers.is_empty() {
|
||||
return Ok(vec![package_id.to_string()]);
|
||||
}
|
||||
let mut sorted = containers;
|
||||
for required in order {
|
||||
if !sorted.iter().any(|name| name == required) {
|
||||
sorted.push((*required).to_string());
|
||||
}
|
||||
}
|
||||
// If no special order is defined, fall back to mempool order for legacy
|
||||
// multi-container names that may still be returned by config lookups.
|
||||
let effective_order: &[&str] = if order.is_empty() {
|
||||
@ -393,8 +408,14 @@ pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result<Vec
|
||||
} else {
|
||||
order
|
||||
};
|
||||
sorted.sort_by_key(|c| effective_order.iter().position(|o| *o == c).unwrap_or(99));
|
||||
Ok(sorted)
|
||||
let mut sorted = containers;
|
||||
sorted.sort_by_key(|c| {
|
||||
effective_order
|
||||
.iter()
|
||||
.position(|o| *o == c)
|
||||
.unwrap_or(usize::MAX)
|
||||
});
|
||||
sorted
|
||||
}
|
||||
|
||||
/// Configure Fedimint Gateway to use LND instead of LDK.
|
||||
@ -452,7 +473,48 @@ pub(super) fn configure_fedimint_lnd(
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use super::{requires_unpruned_bitcoin, startup_order};
|
||||
use super::{order_present_containers, requires_unpruned_bitcoin, startup_order};
|
||||
|
||||
#[test]
|
||||
fn order_present_containers_never_injects_phantom_stack_members() {
|
||||
// The live mempool stack on a node: db + api + frontend. These are the
|
||||
// only real container names; the startup_order list also contains
|
||||
// variant/legacy names (mysql-mempool, archy-mempool-api, ...) that are
|
||||
// NOT live here and must never appear in the result — a phantom name in
|
||||
// the start list aborts the orchestrator start mid-sequence (gate
|
||||
// #73/#74).
|
||||
let present = vec![
|
||||
"mempool".to_string(),
|
||||
"mempool-api".to_string(),
|
||||
"archy-mempool-db".to_string(),
|
||||
];
|
||||
let ordered = order_present_containers("mempool", present);
|
||||
// Dependency order: db -> api -> frontend.
|
||||
assert_eq!(ordered, vec!["archy-mempool-db", "mempool-api", "mempool"]);
|
||||
// No phantom variants leaked in.
|
||||
for phantom in ["mysql-mempool", "archy-mempool-api", "archy-mempool-web"] {
|
||||
assert!(
|
||||
!ordered.iter().any(|c| c == phantom),
|
||||
"phantom {phantom} must not be injected"
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn order_present_containers_orders_known_before_unknown() {
|
||||
let present = vec!["mempool".to_string(), "some-sidecar".to_string()];
|
||||
let ordered = order_present_containers("mempool", present);
|
||||
// The known frontend sorts ahead of an unknown sidecar.
|
||||
assert_eq!(ordered, vec!["mempool", "some-sidecar"]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn order_present_containers_empty_falls_back_to_package_id() {
|
||||
assert_eq!(
|
||||
order_present_containers("mempool", vec![]),
|
||||
vec!["mempool".to_string()]
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn btcpay_start_order_includes_required_stack_members() {
|
||||
|
||||
@ -36,11 +36,21 @@ teardown_file() {
|
||||
}
|
||||
|
||||
@test "container-list reports a valid state for bitcoin-knots" {
|
||||
run rpc_result container-list
|
||||
[ "$status" -eq 0 ]
|
||||
local state
|
||||
state=$(echo "$output" | jq -r '.[] | select(.name == "bitcoin-knots") | .state')
|
||||
[[ "$state" =~ ^(running|stopped|exited|created|paused)$ ]]
|
||||
# Poll briefly: a container caught mid-reconcile can momentarily report a
|
||||
# transient state ("restarting"/"configured"/"removing") or no state at all.
|
||||
# A genuinely-stuck container never settles, so this still catches real
|
||||
# breakage; it only absorbs churn (e.g. another container bouncing right
|
||||
# before the read-only tier runs).
|
||||
local state="" deadline=$(( $(date +%s) + 30 ))
|
||||
while (( $(date +%s) < deadline )); do
|
||||
run rpc_result container-list
|
||||
[ "$status" -eq 0 ]
|
||||
state=$(echo "$output" | jq -r '.[] | select(.name == "bitcoin-knots") | .state')
|
||||
[[ "$state" =~ ^(running|stopped|exited|created|paused)$ ]] && return 0
|
||||
sleep 3
|
||||
done
|
||||
echo "bitcoin-knots never reported a settled valid state within 30s (last: '$state')" >&2
|
||||
return 1
|
||||
}
|
||||
|
||||
@test "container-status returns a valid status object for bitcoin-knots" {
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user