From 71cc9ac46a75ea1cc89f1fd969f9293c287c2312 Mon Sep 17 00:00:00 2001 From: archipelago Date: Fri, 26 Jun 2026 04:27:02 -0400 Subject: [PATCH] fix(uninstall): bound systemctl/podman teardown so uninstall can't hang MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Uninstalling immich/grafana could hang with a frozen full-red progress bar, leave a ghost entry stuck in My Apps, and then refuse reinstall. Single root cause: quadlet::disable_remove() — called first in the uninstall task (via companion + orchestrator teardown) — ran `systemctl --user stop`, daemon-reload, and `podman rm -f` with NO timeout. On rootless podman a generated unit can wedge in "deactivating" while podman hangs underneath, so `systemctl stop` blocks forever. The spawned uninstall task then never returns Ok or Err, so: - set_uninstall_stage() (after the stop) never fires → progress frozen; - remove_package_state_entry() never runs → entry stranded in `Removing` → ghost in My Apps; - the install guard rejects reinstall with "already Removing". The spawn wrapper already reverts state on Err and removes the entry on Ok — the only failure mode was a hang that returns neither. Bound the teardown so it always terminates: - systemctl stop → QUADLET_STOP_TIMEOUT, escalate to kill+reset-failed on timeout (reuses the existing helpers); - daemon_reload_user() → bounded systemctl_user_status (30s); - defensive `podman rm -f` → wrapped in tokio timeout. Co-Authored-By: Claude Opus 4.8 (1M context) --- core/archipelago/src/container/quadlet.rs | 40 +++++++++++++++-------- 1 file changed, 27 insertions(+), 13 deletions(-) diff --git a/core/archipelago/src/container/quadlet.rs b/core/archipelago/src/container/quadlet.rs index 2e76cf80..9250e841 100644 --- a/core/archipelago/src/container/quadlet.rs +++ b/core/archipelago/src/container/quadlet.rs @@ -581,11 +581,12 @@ pub async fn write_if_changed(unit: &QuadletUnit, dir: &Path) -> Result { /// Reload the user systemd manager. Required after any quadlet write /// or removal so systemd picks up the generated `.service` translation. pub async fn daemon_reload_user() -> Result<()> { - let status = Command::new("systemctl") - .args(["--user", "daemon-reload"]) - .status() + // Bounded: a wedged user manager (e.g. a unit stuck "deactivating" while + // podman hangs) could otherwise block daemon-reload indefinitely and freeze + // any caller — notably uninstall teardown. + let status = systemctl_user_status(&["daemon-reload"], Duration::from_secs(30)) .await - .context("spawn systemctl --user daemon-reload")?; + .context("systemctl --user daemon-reload")?; if !status.success() { return Err(anyhow!("systemctl --user daemon-reload exited {status}")); } @@ -787,11 +788,19 @@ fn directive_values(unit_body: &str, prefix: &str) -> Vec { /// that systemd no longer knows about. pub async fn disable_remove(unit_name: &str, dir: &Path) -> Result<()> { let svc = format!("{unit_name}.service"); - // Stop first; ignore failure (unit may already be down). - let _ = Command::new("systemctl") - .args(["--user", "stop", &svc]) - .status() - .await; + // Stop first; ignore failure (unit may already be down). BOUNDED — on + // rootless podman a generated unit can wedge in "deactivating" while + // `podman rm -f` hangs underneath it, and an unbounded `systemctl stop` + // would block the entire uninstall forever: the progress bar freezes and + // the package entry is stranded in `Removing` (a ghost in My Apps that also + // blocks reinstall). If the graceful stop times out, escalate to + // SIGKILL + reset-failed so teardown always proceeds. + if systemctl_user_status(&["stop", &svc], QUADLET_STOP_TIMEOUT) + .await + .is_err() + { + let _ = kill_and_reset_service(&svc).await; + } let path = dir.join(format!("{unit_name}.container")); if fs::try_exists(&path).await.unwrap_or(false) { match fs::remove_file(&path).await { @@ -802,10 +811,15 @@ pub async fn disable_remove(unit_name: &str, dir: &Path) -> Result<()> { } daemon_reload_user().await.ok(); // Defensive: kill the actual container too, in case quadlet left it. - let _ = Command::new("podman") - .args(["rm", "-f", unit_name]) - .status() - .await; + // Bounded so a hung podman store can't re-introduce the stall this function + // exists to avoid. + let _ = tokio::time::timeout( + QUADLET_STOP_TIMEOUT, + Command::new("podman") + .args(["rm", "-f", unit_name]) + .status(), + ) + .await; Ok(()) }