fix(uninstall): bound systemctl/podman teardown so uninstall can't hang
Uninstalling immich/grafana could hang with a frozen full-red progress
bar, leave a ghost entry stuck in My Apps, and then refuse reinstall.
Single root cause: quadlet::disable_remove() — called first in the
uninstall task (via companion + orchestrator teardown) — ran
`systemctl --user stop`, daemon-reload, and `podman rm -f` with NO
timeout. On rootless podman a generated unit can wedge in "deactivating"
while podman hangs underneath, so `systemctl stop` blocks forever. The
spawned uninstall task then never returns Ok or Err, so:
- set_uninstall_stage() (after the stop) never fires → progress frozen;
- remove_package_state_entry() never runs → entry stranded in
`Removing` → ghost in My Apps;
- the install guard rejects reinstall with "already Removing".
The spawn wrapper already reverts state on Err and removes the entry on
Ok — the only failure mode was a hang that returns neither. Bound the
teardown so it always terminates:
- systemctl stop → bounded by QUADLET_STOP_TIMEOUT; if the bounded call
returns an error (e.g. on timeout), escalate to kill+reset-failed
(reuses the existing helpers);
- daemon_reload_user() → now runs through the bounded systemctl_user_status helper (30s timeout);
- defensive `podman rm -f` → wrapped in tokio::time::timeout (QUADLET_STOP_TIMEOUT).
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
2ebcd8f9a8
commit
71cc9ac46a
@ -581,11 +581,12 @@ pub async fn write_if_changed(unit: &QuadletUnit, dir: &Path) -> Result<bool> {
|
|||||||
/// Reload the user systemd manager. Required after any quadlet write
|
/// Reload the user systemd manager. Required after any quadlet write
|
||||||
/// or removal so systemd picks up the generated `.service` translation.
|
/// or removal so systemd picks up the generated `.service` translation.
|
||||||
pub async fn daemon_reload_user() -> Result<()> {
|
pub async fn daemon_reload_user() -> Result<()> {
|
||||||
let status = Command::new("systemctl")
|
// Bounded: a wedged user manager (e.g. a unit stuck "deactivating" while
|
||||||
.args(["--user", "daemon-reload"])
|
// podman hangs) could otherwise block daemon-reload indefinitely and freeze
|
||||||
.status()
|
// any caller — notably uninstall teardown.
|
||||||
|
let status = systemctl_user_status(&["daemon-reload"], Duration::from_secs(30))
|
||||||
.await
|
.await
|
||||||
.context("spawn systemctl --user daemon-reload")?;
|
.context("systemctl --user daemon-reload")?;
|
||||||
if !status.success() {
|
if !status.success() {
|
||||||
return Err(anyhow!("systemctl --user daemon-reload exited {status}"));
|
return Err(anyhow!("systemctl --user daemon-reload exited {status}"));
|
||||||
}
|
}
|
||||||
@ -787,11 +788,19 @@ fn directive_values(unit_body: &str, prefix: &str) -> Vec<String> {
|
|||||||
/// that systemd no longer knows about.
|
/// that systemd no longer knows about.
|
||||||
pub async fn disable_remove(unit_name: &str, dir: &Path) -> Result<()> {
|
pub async fn disable_remove(unit_name: &str, dir: &Path) -> Result<()> {
|
||||||
let svc = format!("{unit_name}.service");
|
let svc = format!("{unit_name}.service");
|
||||||
// Stop first; ignore failure (unit may already be down).
|
// Stop first; ignore failure (unit may already be down). BOUNDED — on
|
||||||
let _ = Command::new("systemctl")
|
// rootless podman a generated unit can wedge in "deactivating" while
|
||||||
.args(["--user", "stop", &svc])
|
// `podman rm -f` hangs underneath it, and an unbounded `systemctl stop`
|
||||||
.status()
|
// would block the entire uninstall forever: the progress bar freezes and
|
||||||
.await;
|
// the package entry is stranded in `Removing` (a ghost in My Apps that also
|
||||||
|
// blocks reinstall). If the graceful stop times out, escalate to
|
||||||
|
// SIGKILL + reset-failed so teardown always proceeds.
|
||||||
|
if systemctl_user_status(&["stop", &svc], QUADLET_STOP_TIMEOUT)
|
||||||
|
.await
|
||||||
|
.is_err()
|
||||||
|
{
|
||||||
|
let _ = kill_and_reset_service(&svc).await;
|
||||||
|
}
|
||||||
let path = dir.join(format!("{unit_name}.container"));
|
let path = dir.join(format!("{unit_name}.container"));
|
||||||
if fs::try_exists(&path).await.unwrap_or(false) {
|
if fs::try_exists(&path).await.unwrap_or(false) {
|
||||||
match fs::remove_file(&path).await {
|
match fs::remove_file(&path).await {
|
||||||
@ -802,9 +811,14 @@ pub async fn disable_remove(unit_name: &str, dir: &Path) -> Result<()> {
|
|||||||
}
|
}
|
||||||
daemon_reload_user().await.ok();
|
daemon_reload_user().await.ok();
|
||||||
// Defensive: kill the actual container too, in case quadlet left it.
|
// Defensive: kill the actual container too, in case quadlet left it.
|
||||||
let _ = Command::new("podman")
|
// Bounded so a hung podman store can't re-introduce the stall this function
|
||||||
|
// exists to avoid.
|
||||||
|
let _ = tokio::time::timeout(
|
||||||
|
QUADLET_STOP_TIMEOUT,
|
||||||
|
Command::new("podman")
|
||||||
.args(["rm", "-f", unit_name])
|
.args(["rm", "-f", unit_name])
|
||||||
.status()
|
.status(),
|
||||||
|
)
|
||||||
.await;
|
.await;
|
||||||
Ok(())
|
Ok(())
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user