fix(orchestrator): self-heal ANY installed app, not just baseline ones
The boot reconciler only self-healed a fully-absent container for one of
8 hardcoded "required baseline" apps (bitcoin-knots, electrumx, lnd,
mempool*, filebrowser, fedimint-clientd) — every other genuinely-installed
app whose container went missing (crash, lost record, wedged teardown)
was left as Left("absent") forever, with no path back short of an
explicit manual reinstall.
Surfaced live: indeedhub's backend containers (minio/postgres/relay) went
absent on .116 and never recovered despite indeedhub still being
installed. By the time this code path runs, the app is already confirmed
NOT user-stopped and NOT user-uninstalled (both checked earlier in the
same function, backed by durable markers correctly cleared on
reinstall/start) — so gating self-heal further behind a hardcoded app-id
list was an unnecessary restriction, not a safety measure. An app the
user installed and never removed should come back on its own, same as
baseline services always have.
Deleted the now-dead is_required_baseline_app(); updated the test that
had locked in the old (wrong) behavior.
Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
parent
2c1d2a2572
commit
936b4cca29
@@ -51,23 +51,6 @@ use crate::update::host_sudo;
|
||||
const UI_APP_IDS: &[&str] = &["bitcoin-ui", "electrs-ui", "lnd-ui"];
|
||||
const ARCHIVAL_BITCOIN_DISK_GB: u64 = 1000;
|
||||
|
||||
fn is_required_baseline_app(app_id: &str) -> bool {
|
||||
matches!(
|
||||
app_id,
|
||||
"bitcoin-knots"
|
||||
| "electrumx"
|
||||
| "lnd"
|
||||
| "mempool-api"
|
||||
| "mempool"
|
||||
| "archy-mempool-db"
|
||||
| "filebrowser"
|
||||
// fmcd: bundled on every node so the wallet's Fedimint side works
|
||||
// out of the box (auto-joins the default federation). Self-heals if
|
||||
// removed, like the other baseline services.
|
||||
| "fedimint-clientd"
|
||||
)
|
||||
}
|
||||
|
||||
fn is_restart_sensitive_app(app_id: &str) -> bool {
|
||||
matches!(
|
||||
app_id,
|
||||
@@ -1633,11 +1616,11 @@ impl ProdContainerOrchestrator {
|
||||
}
|
||||
|
||||
// Same durability problem as user-stopped above, but for uninstall:
|
||||
// `is_required_baseline_app` below otherwise self-heals bitcoin-knots,
|
||||
// electrumx, lnd, mempool, etc. the moment their container is missing —
|
||||
// including right after an explicit uninstall, since the in-memory
|
||||
// `disabled` set doesn't survive a `load_manifests()` reload (every
|
||||
// archipelago restart/reboot runs one before the first reconcile).
|
||||
// any genuinely-installed app self-heals below the moment its
|
||||
// container is missing — including right after an explicit
|
||||
// uninstall, since the in-memory `disabled` set doesn't survive a
|
||||
// `load_manifests()` reload (every archipelago restart/reboot runs
|
||||
// one before the first reconcile).
|
||||
{
|
||||
let user_uninstalled =
|
||||
crate::crash_recovery::load_user_uninstalled(&self.data_dir).await;
|
||||
@@ -1933,21 +1916,21 @@ impl ProdContainerOrchestrator {
|
||||
return Ok(ReconcileAction::Started);
|
||||
}
|
||||
|
||||
// Required baseline services must self-heal even if both the
|
||||
// podman record and Quadlet unit are gone. These are installed
|
||||
// by first boot and are prerequisites for dependent apps; an
|
||||
// "absent" result leaves the node permanently degraded after
|
||||
// crash cleanup.
|
||||
if mode == ReconcileMode::ExistingOnly && is_required_baseline_app(&app_id) {
|
||||
// By this point `app_id` is neither user-stopped nor
|
||||
// user-uninstalled (both checked earlier in this fn) and its
|
||||
// manifest is still loaded — i.e. it's a genuinely-installed
|
||||
// app whose container is simply gone (crash, lost record,
|
||||
// wedged teardown cleared by reboot). It must self-heal
|
||||
// regardless of whether it happens to be one of the hardcoded
|
||||
// "required baseline" apps: an app the user installed and
|
||||
// never removed should come back on its own, the same as
|
||||
// baseline services always have. `is_required_baseline_app`
|
||||
// used to gate this and left every other installed-but-absent
|
||||
// app (e.g. a stack's backend containers) stuck forever.
|
||||
if mode == ReconcileMode::ExistingOnly {
|
||||
self.install_fresh(lm).await?;
|
||||
return Ok(ReconcileAction::Installed);
|
||||
}
|
||||
|
||||
// Optional container missing entirely → leave absent during
|
||||
// boot reconcile; explicit install/start can create it.
|
||||
if mode == ReconcileMode::ExistingOnly {
|
||||
return Ok(ReconcileAction::Left("absent".to_string()));
|
||||
}
|
||||
self.install_fresh(lm).await?;
|
||||
Ok(ReconcileAction::Installed)
|
||||
}
|
||||
@@ -3648,10 +3631,10 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
|
||||
}
|
||||
// Durable, unlike `state.disabled` above (wiped by every
|
||||
// `load_manifests()`, which runs on every archipelago restart/reboot
|
||||
// before the first reconcile) — without this, `is_required_baseline_app`
|
||||
// self-heals bitcoin-knots/electrumx/lnd/mempool/etc. right back after
|
||||
// an explicit uninstall survives to the next restart. Only mark on the
|
||||
// success path above — a failed removal means the app isn't actually gone.
|
||||
// before the first reconcile) — without this, reconcile's self-heal
|
||||
// would bring the app right back after an explicit uninstall
|
||||
// survives to the next restart. Only mark on the success path above
|
||||
// — a failed removal means the app isn't actually gone.
|
||||
crate::crash_recovery::mark_user_uninstalled(&self.data_dir, app_id).await;
|
||||
Ok(())
|
||||
}
|
||||
@@ -4866,9 +4849,17 @@ app:
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn reconcile_existing_leaves_missing_optional_app_absent() {
|
||||
async fn reconcile_existing_self_heals_missing_optional_installed_app() {
|
||||
// A non-baseline app (gitea) whose manifest is still loaded (i.e.
|
||||
// genuinely installed, not user-uninstalled — see the
|
||||
// durable-user-uninstalled-marker test above for that case) must
|
||||
// self-heal the same as a required baseline app when its container
|
||||
// is fully gone. Leaving any installed-but-absent app stuck forever
|
||||
// regressed a real node (indeedhub's backend containers never came
|
||||
// back after going absent) — self-heal is no longer baseline-only.
|
||||
let rt = Arc::new(MockRuntime::default());
|
||||
let orch = orch_with(rt.clone()).await;
|
||||
let mut orch = orch_with(rt.clone()).await;
|
||||
orch.set_disk_gb_for_test(500);
|
||||
orch.insert_manifest_for_test(
|
||||
pull_manifest("gitea", "docker.io/gitea/gitea:latest"),
|
||||
PathBuf::from("/tmp/gitea"),
|
||||
@@ -4879,13 +4870,13 @@ app:
|
||||
|
||||
assert_eq!(
|
||||
report.actions,
|
||||
vec![("gitea".to_string(), ReconcileAction::Left("absent".into()))]
|
||||
vec![("gitea".to_string(), ReconcileAction::Installed)]
|
||||
);
|
||||
assert!(report.failures.is_empty());
|
||||
let calls = rt.calls();
|
||||
assert!(!calls.iter().any(|c| c.starts_with("pull_image:")));
|
||||
assert!(!calls.iter().any(|c| c.starts_with("create_container:")));
|
||||
assert!(!calls.iter().any(|c| c.starts_with("start_container:")));
|
||||
assert!(calls.iter().any(|c| c.starts_with("pull_image:")));
|
||||
assert!(calls.iter().any(|c| c.starts_with("create_container:")));
|
||||
assert!(calls.iter().any(|c| c.starts_with("start_container:")));
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user