diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index 572dd5ad..43c283d1 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -51,23 +51,6 @@ use crate::update::host_sudo; const UI_APP_IDS: &[&str] = &["bitcoin-ui", "electrs-ui", "lnd-ui"]; const ARCHIVAL_BITCOIN_DISK_GB: u64 = 1000; -fn is_required_baseline_app(app_id: &str) -> bool { - matches!( - app_id, - "bitcoin-knots" - | "electrumx" - | "lnd" - | "mempool-api" - | "mempool" - | "archy-mempool-db" - | "filebrowser" - // fmcd: bundled on every node so the wallet's Fedimint side works - // out of the box (auto-joins the default federation). Self-heals if - // removed, like the other baseline services. - | "fedimint-clientd" - ) -} - fn is_restart_sensitive_app(app_id: &str) -> bool { matches!( app_id, @@ -1633,11 +1616,11 @@ impl ProdContainerOrchestrator { } // Same durability problem as user-stopped above, but for uninstall: - // `is_required_baseline_app` below otherwise self-heals bitcoin-knots, - // electrumx, lnd, mempool, etc. the moment their container is missing — - // including right after an explicit uninstall, since the in-memory - // `disabled` set doesn't survive a `load_manifests()` reload (every - // archipelago restart/reboot runs one before the first reconcile). + // any genuinely-installed app self-heals below the moment its + // container is missing — including right after an explicit + // uninstall, since the in-memory `disabled` set doesn't survive a + // `load_manifests()` reload (every archipelago restart/reboot runs + // one before the first reconcile). 
{ let user_uninstalled = crate::crash_recovery::load_user_uninstalled(&self.data_dir).await; @@ -1933,21 +1916,21 @@ impl ProdContainerOrchestrator { return Ok(ReconcileAction::Started); } - // Required baseline services must self-heal even if both the - // podman record and Quadlet unit are gone. These are installed - // by first boot and are prerequisites for dependent apps; an - // "absent" result leaves the node permanently degraded after - // crash cleanup. - if mode == ReconcileMode::ExistingOnly && is_required_baseline_app(&app_id) { + // By this point `app_id` is neither user-stopped nor + // user-uninstalled (both checked earlier in this fn) and its + // manifest is still loaded — i.e. it's a genuinely-installed + // app whose container is simply gone (crash, lost record, + // wedged teardown cleared by reboot). It must self-heal + // regardless of whether it happens to be one of the hardcoded + // "required baseline" apps: an app the user installed and + // never removed should come back on its own, the same as + // baseline services always have. `is_required_baseline_app` + // used to gate this and left every other installed-but-absent + // app (e.g. a stack's backend containers) stuck forever. + if mode == ReconcileMode::ExistingOnly { self.install_fresh(lm).await?; return Ok(ReconcileAction::Installed); } - - // Optional container missing entirely → leave absent during - // boot reconcile; explicit install/start can create it. - if mode == ReconcileMode::ExistingOnly { - return Ok(ReconcileAction::Left("absent".to_string())); - } self.install_fresh(lm).await?; Ok(ReconcileAction::Installed) } @@ -3648,10 +3631,10 @@ impl ContainerOrchestrator for ProdContainerOrchestrator { } // Durable, unlike `state.disabled` above (wiped by every // `load_manifests()`, which runs on every archipelago restart/reboot - // before the first reconcile) — without this, `is_required_baseline_app` - // self-heals bitcoin-knots/electrumx/lnd/mempool/etc. 
right back after - // an explicit uninstall survives to the next restart. Only mark on the - // success path above — a failed removal means the app isn't actually gone. + // before the first reconcile) — without this, reconcile's self-heal + // would bring the app right back on the next restart after an + // explicit uninstall. Only mark on the success path above + // — a failed removal means the app isn't actually gone. crate::crash_recovery::mark_user_uninstalled(&self.data_dir, app_id).await; Ok(()) } @@ -4866,9 +4849,17 @@ app: } #[tokio::test] - async fn reconcile_existing_leaves_missing_optional_app_absent() { + async fn reconcile_existing_self_heals_missing_optional_installed_app() { + // A non-baseline app (gitea) whose manifest is still loaded (i.e. + // genuinely installed, not user-uninstalled — see the + // durable-user-uninstalled-marker test above for that case) must + // self-heal the same as a required baseline app when its container + // is fully gone. Leaving any installed-but-absent app stuck forever + // regressed a real node (indeedhub's backend containers never came + // back after going absent) — self-heal is no longer baseline-only.
let rt = Arc::new(MockRuntime::default()); - let orch = orch_with(rt.clone()).await; + let mut orch = orch_with(rt.clone()).await; + orch.set_disk_gb_for_test(500); orch.insert_manifest_for_test( pull_manifest("gitea", "docker.io/gitea/gitea:latest"), PathBuf::from("/tmp/gitea"), @@ -4879,13 +4870,13 @@ app: assert_eq!( report.actions, - vec![("gitea".to_string(), ReconcileAction::Left("absent".into()))] + vec![("gitea".to_string(), ReconcileAction::Installed)] ); assert!(report.failures.is_empty()); let calls = rt.calls(); - assert!(!calls.iter().any(|c| c.starts_with("pull_image:"))); - assert!(!calls.iter().any(|c| c.starts_with("create_container:"))); - assert!(!calls.iter().any(|c| c.starts_with("start_container:"))); + assert!(calls.iter().any(|c| c.starts_with("pull_image:"))); + assert!(calls.iter().any(|c| c.starts_with("create_container:"))); + assert!(calls.iter().any(|c| c.starts_with("start_container:"))); } #[tokio::test]