fix(orchestrator): self-heal ANY installed app, not just baseline ones

The boot reconciler only self-healed a fully-absent container for one of
8 hardcoded "required baseline" apps (bitcoin-knots, electrumx, lnd,
mempool-api, mempool, archy-mempool-db, filebrowser, fedimint-clientd) —
every other genuinely-installed app whose container went missing (crash,
lost record, wedged teardown) stayed at Left("absent") forever, with no
path back short of an explicit manual reinstall.

Surfaced live: indeedhub's backend containers (minio/postgres/relay) went
absent on .116 and never recovered despite indeedhub still being
installed. By the time this code path runs, the app is already confirmed
NOT user-stopped and NOT user-uninstalled (both checked earlier in the
same function, backed by durable markers correctly cleared on
reinstall/start) — so gating self-heal further behind a hardcoded app-id
list was an unnecessary restriction, not a safety measure. An app the
user installed and never removed should come back on its own, same as
baseline services always have.

Deleted the now-dead is_required_baseline_app(); updated the test that
had locked in the old (wrong) behavior.

Co-Authored-By: Claude Sonnet 5 <noreply@anthropic.com>
This commit is contained in:
archipelago 2026-07-01 17:27:16 -04:00
parent 2c1d2a2572
commit 936b4cca29

View File

@ -51,23 +51,6 @@ use crate::update::host_sudo;
const UI_APP_IDS: &[&str] = &["bitcoin-ui", "electrs-ui", "lnd-ui"];
const ARCHIVAL_BITCOIN_DISK_GB: u64 = 1000;
/// Reports whether `app_id` is one of the hardcoded baseline service ids.
///
/// The check is an exact, case-sensitive string comparison against a fixed
/// table; any id not in the table yields `false`.
fn is_required_baseline_app(app_id: &str) -> bool {
    const BASELINE_APP_IDS: [&str; 8] = [
        "bitcoin-knots",
        "electrumx",
        "lnd",
        "mempool-api",
        "mempool",
        "archy-mempool-db",
        "filebrowser",
        // fmcd: bundled on every node so the wallet's Fedimint side works
        // out of the box (auto-joins the default federation). Self-heals if
        // removed, like the other baseline services.
        "fedimint-clientd",
    ];

    BASELINE_APP_IDS.iter().any(|&known| known == app_id)
}
fn is_restart_sensitive_app(app_id: &str) -> bool {
matches!(
app_id,
@ -1633,11 +1616,11 @@ impl ProdContainerOrchestrator {
}
// Same durability problem as user-stopped above, but for uninstall:
// `is_required_baseline_app` below otherwise self-heals bitcoin-knots,
// electrumx, lnd, mempool, etc. the moment their container is missing —
// including right after an explicit uninstall, since the in-memory
// `disabled` set doesn't survive a `load_manifests()` reload (every
// archipelago restart/reboot runs one before the first reconcile).
// any genuinely-installed app self-heals below the moment its
// container is missing — including right after an explicit
// uninstall, since the in-memory `disabled` set doesn't survive a
// `load_manifests()` reload (every archipelago restart/reboot runs
// one before the first reconcile).
{
let user_uninstalled =
crate::crash_recovery::load_user_uninstalled(&self.data_dir).await;
@ -1933,21 +1916,21 @@ impl ProdContainerOrchestrator {
return Ok(ReconcileAction::Started);
}
// Required baseline services must self-heal even if both the
// podman record and Quadlet unit are gone. These are installed
// by first boot and are prerequisites for dependent apps; an
// "absent" result leaves the node permanently degraded after
// crash cleanup.
if mode == ReconcileMode::ExistingOnly && is_required_baseline_app(&app_id) {
// By this point `app_id` is neither user-stopped nor
// user-uninstalled (both checked earlier in this fn) and its
// manifest is still loaded — i.e. it's a genuinely-installed
// app whose container is simply gone (crash, lost record,
// wedged teardown cleared by reboot). It must self-heal
// regardless of whether it happens to be one of the hardcoded
// "required baseline" apps: an app the user installed and
// never removed should come back on its own, the same as
// baseline services always have. `is_required_baseline_app`
// used to gate this and left every other installed-but-absent
// app (e.g. a stack's backend containers) stuck forever.
if mode == ReconcileMode::ExistingOnly {
self.install_fresh(lm).await?;
return Ok(ReconcileAction::Installed);
}
// Optional container missing entirely → leave absent during
// boot reconcile; explicit install/start can create it.
if mode == ReconcileMode::ExistingOnly {
return Ok(ReconcileAction::Left("absent".to_string()));
}
self.install_fresh(lm).await?;
Ok(ReconcileAction::Installed)
}
@ -3648,10 +3631,10 @@ impl ContainerOrchestrator for ProdContainerOrchestrator {
}
// Durable, unlike `state.disabled` above (wiped by every
// `load_manifests()`, which runs on every archipelago restart/reboot
// before the first reconcile) — without this, `is_required_baseline_app`
// self-heals bitcoin-knots/electrumx/lnd/mempool/etc. right back after
// an explicit uninstall survives to the next restart. Only mark on the
// success path above — a failed removal means the app isn't actually gone.
// before the first reconcile) — without this, reconcile's self-heal
// would bring the app right back on the next restart, undoing an
// explicit uninstall. Only mark on the success path above — a failed
// removal means the app isn't actually gone.
crate::crash_recovery::mark_user_uninstalled(&self.data_dir, app_id).await;
Ok(())
}
@ -4866,9 +4849,17 @@ app:
}
#[tokio::test]
async fn reconcile_existing_leaves_missing_optional_app_absent() {
async fn reconcile_existing_self_heals_missing_optional_installed_app() {
// A non-baseline app (gitea) whose manifest is still loaded (i.e.
// genuinely installed, not user-uninstalled — see the
// durable-user-uninstalled-marker test above for that case) must
// self-heal the same as a required baseline app when its container
// is fully gone. Leaving any installed-but-absent app stuck forever
// regressed a real node (indeedhub's backend containers never came
// back after going absent) — self-heal is no longer baseline-only.
let rt = Arc::new(MockRuntime::default());
let orch = orch_with(rt.clone()).await;
let mut orch = orch_with(rt.clone()).await;
orch.set_disk_gb_for_test(500);
orch.insert_manifest_for_test(
pull_manifest("gitea", "docker.io/gitea/gitea:latest"),
PathBuf::from("/tmp/gitea"),
@ -4879,13 +4870,13 @@ app:
assert_eq!(
report.actions,
vec![("gitea".to_string(), ReconcileAction::Left("absent".into()))]
vec![("gitea".to_string(), ReconcileAction::Installed)]
);
assert!(report.failures.is_empty());
let calls = rt.calls();
assert!(!calls.iter().any(|c| c.starts_with("pull_image:")));
assert!(!calls.iter().any(|c| c.starts_with("create_container:")));
assert!(!calls.iter().any(|c| c.starts_with("start_container:")));
assert!(calls.iter().any(|c| c.starts_with("pull_image:")));
assert!(calls.iter().any(|c| c.starts_with("create_container:")));
assert!(calls.iter().any(|c| c.starts_with("start_container:")));
}
#[tokio::test]