From 5be2febe130c0360fd272a0ff8ed1a5137f35d32 Mon Sep 17 00:00:00 2001 From: archipelago Date: Fri, 1 May 2026 15:57:15 -0400 Subject: [PATCH] fix(bootstrap): don't nuke podman socket dir during runtime self-heal MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Observed live on .198: heal_podman_state was removing $XDG_RUNTIME_DIR/podman/ alongside containers/ and libpod/. That dir holds the systemd-bound podman.sock — the listener systemd creates for socket-activated podman.service. Removing it broke every libpod HTTP call from the orchestrator until `systemctl --user restart podman.socket` ran. Far worse than any wedge it was trying to repair. Drop podman/ from the cleanup list. The runtime state we actually want to clean for FM6 (bolt_state.db drift) lives in containers/ and libpod/ only. Co-Authored-By: Claude Opus 4.7 (1M context) --- core/archipelago/src/bootstrap.rs | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/core/archipelago/src/bootstrap.rs b/core/archipelago/src/bootstrap.rs index 6cd642a2..5666644d 100644 --- a/core/archipelago/src/bootstrap.rs +++ b/core/archipelago/src/bootstrap.rs @@ -94,11 +94,17 @@ enum PodmanHealOutcome { /// Probe `podman info`. If it succeeds the daemon's runtime state is /// fine and we return `Healthy` immediately. If it times out, fails to /// spawn, or returns an "invalid internal status" / "database state" -/// error, the runtime state in `$XDG_RUNTIME_DIR/{containers,libpod,podman}` -/// is likely wedged. We delete it and re-probe — podman rebuilds the -/// runtime state from persistent state under +/// error, the runtime state under `$XDG_RUNTIME_DIR/{containers,libpod}` +/// is likely wedged. We delete those two dirs and re-probe — podman +/// rebuilds runtime state from persistent storage under /// `$HOME/.local/share/containers/storage/`. 
/// +/// `$XDG_RUNTIME_DIR/podman/` is **deliberately not touched**: that's +/// where systemd's socket-activated `podman.sock` listener lives. If we +/// removed it, every libpod HTTP call from the orchestrator would fail +/// with "connection refused" until `systemctl --user restart +/// podman.socket` ran — far worse than the wedge we'd be trying to fix. +/// /// Why this is safe at startup: /// - We run BEFORE the orchestrator starts its reconcile loop, so no /// archipelago code is currently calling podman. @@ -120,10 +126,11 @@ async fn heal_podman_state() -> Result<PodmanHealOutcome> { if probe_podman_ok().await { return Ok(PodmanHealOutcome::Healthy); } - // Wedged. Clean runtime state and try again. + // Wedged. Clean runtime state and try again. Note: `podman/` is + // intentionally absent from this list — see fn docstring. let xdg = std::env::var("XDG_RUNTIME_DIR") .context("XDG_RUNTIME_DIR not set; can't locate podman runtime state to clean")?; - for sub in &["containers", "libpod", "podman"] { + for sub in &["containers", "libpod"] { let path = PathBuf::from(&xdg).join(sub); match fs::remove_dir_all(&path).await { Ok(()) => debug!(path = %path.display(), "removed podman runtime state dir"),