fix(orchestrator): order only live containers on package start/restart
package.restart resolved its container list via
ordered_containers_for_start, which injected every name from the
union startup_order list that wasn't already present — including
variant names not live on a given node (mysql-mempool,
archy-mempool-api, archy-mempool-web). The phantom mysql-mempool is
2nd in the mempool start order, so do_orchestrator_package_start hit
its unknown-app-id fallback, do_package_start failed the inspect
("no such object"), and the `?` aborted the whole start sequence —
leaving mempool-api + the frontend down until the health monitor
recovered them minutes later. That was the source of the 5× gate
flakes #73 (frontend not running in 180s) and #74 (api not queryable
in 300s); root-caused from the .228 journal
("Start failed: mysql-mempool").
Replace the inject-then-sort logic with a pure helper
order_present_containers that orders only the actually-present
containers and never adds phantom entries. startup_order remains a
union of name variants across install generations — it's now used
purely to order what's live, not to inject what isn't. +3 unit tests.
Also harden bitcoin-knots.bats "valid state" probe: poll ≤30s for a
settled state instead of a single-shot read, so a container caught
mid-reconcile (transient restarting/configured) can't flake a 20-min
iteration. A genuinely-stuck container never settles, so real
breakage is still caught.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
57a013bc66
commit
92d7f52dd6
@ -376,16 +376,31 @@ pub(super) fn startup_order(package_id: &str) -> &'static [&'static str] {
|
|||||||
/// order for the given app. Unknown containers sort to the end.
|
/// order for the given app. Unknown containers sort to the end.
|
||||||
pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result<Vec<String>> {
|
pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result<Vec<String>> {
|
||||||
let containers = get_containers_for_app(package_id).await?;
|
let containers = get_containers_for_app(package_id).await?;
|
||||||
|
Ok(order_present_containers(package_id, containers))
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Order the *actually-present* containers of an app by its dependency-aware
|
||||||
|
/// startup order. Containers whose name is unknown to the order list sort to
|
||||||
|
/// the end, preserving their relative input order.
|
||||||
|
///
|
||||||
|
/// This deliberately does NOT inject order entries that aren't live
|
||||||
|
/// containers. `startup_order` is a union of container-name variants across
|
||||||
|
/// install generations (e.g. `mysql-mempool` vs `archy-mempool-db`), so any
|
||||||
|
/// single install only ever has a subset of those names. Injecting a phantom
|
||||||
|
/// name makes the start path fail on a "no such object" inspect — and because
|
||||||
|
/// `do_orchestrator_package_start` propagates the unknown-app-id fallback
|
||||||
|
/// error via `?`, every later member (the api + frontend) is then skipped,
|
||||||
|
/// leaving the stack down until the health monitor recovers it minutes later.
|
||||||
|
/// That was the source of mempool gate flakes #73 (frontend) / #74 (api).
|
||||||
|
fn order_present_containers(package_id: &str, containers: Vec<String>) -> Vec<String> {
|
||||||
|
if containers.is_empty() {
|
||||||
|
// Nothing is live under any known name. Fall back to the package id so
|
||||||
|
// a single-container app whose container matches its id still gets one
|
||||||
|
// start attempt; multi-container stacks with no live members are
|
||||||
|
// surfaced as "no containers" by the caller's emptiness check.
|
||||||
|
return vec![package_id.to_string()];
|
||||||
|
}
|
||||||
let order = startup_order(package_id);
|
let order = startup_order(package_id);
|
||||||
if order.is_empty() && containers.is_empty() {
|
|
||||||
return Ok(vec![package_id.to_string()]);
|
|
||||||
}
|
|
||||||
let mut sorted = containers;
|
|
||||||
for required in order {
|
|
||||||
if !sorted.iter().any(|name| name == required) {
|
|
||||||
sorted.push((*required).to_string());
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// If no special order is defined, fall back to mempool order for legacy
|
// If no special order is defined, fall back to mempool order for legacy
|
||||||
// multi-container names that may still be returned by config lookups.
|
// multi-container names that may still be returned by config lookups.
|
||||||
let effective_order: &[&str] = if order.is_empty() {
|
let effective_order: &[&str] = if order.is_empty() {
|
||||||
@ -393,8 +408,14 @@ pub(super) async fn ordered_containers_for_start(package_id: &str) -> Result<Vec
|
|||||||
} else {
|
} else {
|
||||||
order
|
order
|
||||||
};
|
};
|
||||||
sorted.sort_by_key(|c| effective_order.iter().position(|o| *o == c).unwrap_or(99));
|
let mut sorted = containers;
|
||||||
Ok(sorted)
|
sorted.sort_by_key(|c| {
|
||||||
|
effective_order
|
||||||
|
.iter()
|
||||||
|
.position(|o| *o == c)
|
||||||
|
.unwrap_or(usize::MAX)
|
||||||
|
});
|
||||||
|
sorted
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Configure Fedimint Gateway to use LND instead of LDK.
|
/// Configure Fedimint Gateway to use LND instead of LDK.
|
||||||
@ -452,7 +473,48 @@ pub(super) fn configure_fedimint_lnd(
|
|||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
mod tests {
|
mod tests {
|
||||||
use super::{requires_unpruned_bitcoin, startup_order};
|
use super::{order_present_containers, requires_unpruned_bitcoin, startup_order};
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn order_present_containers_never_injects_phantom_stack_members() {
|
||||||
|
// The live mempool stack on a node: db + api + frontend. These are the
|
||||||
|
// only real container names; the startup_order list also contains
|
||||||
|
// variant/legacy names (mysql-mempool, archy-mempool-api, ...) that are
|
||||||
|
// NOT live here and must never appear in the result — a phantom name in
|
||||||
|
// the start list aborts the orchestrator start mid-sequence (gate
|
||||||
|
// #73/#74).
|
||||||
|
let present = vec![
|
||||||
|
"mempool".to_string(),
|
||||||
|
"mempool-api".to_string(),
|
||||||
|
"archy-mempool-db".to_string(),
|
||||||
|
];
|
||||||
|
let ordered = order_present_containers("mempool", present);
|
||||||
|
// Dependency order: db -> api -> frontend.
|
||||||
|
assert_eq!(ordered, vec!["archy-mempool-db", "mempool-api", "mempool"]);
|
||||||
|
// No phantom variants leaked in.
|
||||||
|
for phantom in ["mysql-mempool", "archy-mempool-api", "archy-mempool-web"] {
|
||||||
|
assert!(
|
||||||
|
!ordered.iter().any(|c| c == phantom),
|
||||||
|
"phantom {phantom} must not be injected"
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn order_present_containers_orders_known_before_unknown() {
|
||||||
|
let present = vec!["mempool".to_string(), "some-sidecar".to_string()];
|
||||||
|
let ordered = order_present_containers("mempool", present);
|
||||||
|
// The known frontend sorts ahead of an unknown sidecar.
|
||||||
|
assert_eq!(ordered, vec!["mempool", "some-sidecar"]);
|
||||||
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn order_present_containers_empty_falls_back_to_package_id() {
|
||||||
|
assert_eq!(
|
||||||
|
order_present_containers("mempool", vec![]),
|
||||||
|
vec!["mempool".to_string()]
|
||||||
|
);
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn btcpay_start_order_includes_required_stack_members() {
|
fn btcpay_start_order_includes_required_stack_members() {
|
||||||
|
|||||||
@ -36,11 +36,21 @@ teardown_file() {
|
|||||||
}
|
}
|
||||||
|
|
||||||
@test "container-list reports a valid state for bitcoin-knots" {
|
@test "container-list reports a valid state for bitcoin-knots" {
|
||||||
run rpc_result container-list
|
# Poll briefly: a container caught mid-reconcile can momentarily report a
|
||||||
[ "$status" -eq 0 ]
|
# transient state ("restarting"/"configured"/"removing") or no state at all.
|
||||||
local state
|
# A genuinely-stuck container never settles, so this still catches real
|
||||||
state=$(echo "$output" | jq -r '.[] | select(.name == "bitcoin-knots") | .state')
|
# breakage; it only absorbs churn (e.g. another container bouncing right
|
||||||
[[ "$state" =~ ^(running|stopped|exited|created|paused)$ ]]
|
# before the read-only tier runs).
|
||||||
|
local state="" deadline=$(( $(date +%s) + 30 ))
|
||||||
|
while (( $(date +%s) < deadline )); do
|
||||||
|
run rpc_result container-list
|
||||||
|
[ "$status" -eq 0 ]
|
||||||
|
state=$(echo "$output" | jq -r '.[] | select(.name == "bitcoin-knots") | .state')
|
||||||
|
[[ "$state" =~ ^(running|stopped|exited|created|paused)$ ]] && return 0
|
||||||
|
sleep 3
|
||||||
|
done
|
||||||
|
echo "bitcoin-knots never reported a settled valid state within 30s (last: '$state')" >&2
|
||||||
|
return 1
|
||||||
}
|
}
|
||||||
|
|
||||||
@test "container-status returns a valid status object for bitcoin-knots" {
|
@test "container-status returns a valid status object for bitcoin-knots" {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user