From 8f13298805cde66f7b9a6fac2412ed663d8d37ef Mon Sep 17 00:00:00 2001
From: archipelago
Date: Fri, 1 May 2026 15:23:36 -0400
Subject: [PATCH] fix(bootstrap): self-heal wedged podman runtime state at startup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Closes FM6 (podman bolt_state.db / runtime drift) — observed live on .198
today: bitcoind was running for several minutes, but podman's state DB
reported the container as Exited. The reconciler then tried to "restart"
it, racing the still-bound port 8332 and failing in a loop.

heal_podman_state() runs as the last bootstrap stage, BEFORE the
orchestrator's reconcile loop ticks. It probes `podman info` with a 5s
timeout; on failure it removes the runtime-state dirs under
$XDG_RUNTIME_DIR and re-probes. Persistent storage under
~/.local/share/containers/storage/ is never touched, so containers
re-discover from manifests on next call.

Cleanup never includes `podman system reset` or `system renumber` — those
are destructive and must stay operator-only.

Co-Authored-By: Claude Opus 4.7 (1M context)
---
 core/archipelago/src/bootstrap.rs | 85 +++++++++++++++++++++++++++++++
 1 file changed, 85 insertions(+)

diff --git a/core/archipelago/src/bootstrap.rs b/core/archipelago/src/bootstrap.rs
index 281abc60..6cd642a2 100644
--- a/core/archipelago/src/bootstrap.rs
+++ b/core/archipelago/src/bootstrap.rs
@@ -71,6 +71,91 @@ pub async fn ensure_doctor_installed() {
         Ok(_) => debug!("Secrets directory already at expected mode"),
         Err(e) => warn!("Secrets dir tightening failed (non-fatal): {:#}", e),
     }
+    // Podman self-heal MUST be the last bootstrap stage. If podman's
+    // runtime state is wedged, the orchestrator's first reconcile tick
+    // (which fires seconds after bootstrap returns) will hang or error
+    // on every container. Cleaning the runroot here gives the rest of
+    // the process a healthy podman to talk to.
+    match heal_podman_state().await {
+        Ok(PodmanHealOutcome::Healthy) => debug!("podman runtime state healthy"),
+        Ok(PodmanHealOutcome::Cleaned) => warn!(
+            "podman runtime state was wedged at startup — cleaned runroot and re-probed (CRITICAL)"
+        ),
+        Err(e) => warn!("podman self-heal failed (non-fatal, will retry next boot): {:#}", e),
+    }
+}
+
+#[derive(Debug, PartialEq, Eq)]
+enum PodmanHealOutcome {
+    /// The first `podman info` probe succeeded; nothing was removed.
+    Healthy,
+    /// The first probe failed, runtime dirs were cleaned, and the re-probe succeeded.
+    Cleaned,
+}
+
+/// Probe `podman info`; if the probe fails, remove podman's runtime-state
+/// dirs `$XDG_RUNTIME_DIR/{containers,libpod,podman}` and probe once more.
+/// "Fails" means a timeout, a spawn failure, or any non-zero exit — the
+/// probe never inspects podman's error text. Podman is expected to rebuild
+/// the runtime state from `$HOME/.local/share/containers/storage/`.
+///
+/// Why this is safe at startup:
+/// - We run BEFORE the orchestrator starts its reconcile loop, so no
+///   archipelago code is currently calling podman.
+/// - Persistent metadata under `~/.local/share/containers/` is never touched.
+/// - `unless-stopped` containers and Quadlet-supervised services are
+///   parented under user.slice, not archipelago.service, so they keep
+///   running during the cleanup and podman re-discovers them afterwards.
+///
+/// What this does NOT cover:
+/// - Storage corruption under `~/.local/share/containers/storage/`: needs a
+///   destructive `podman system reset`, which stays operator-only.
+/// - Networking corruption (netavark cache): `podman info` doesn't diagnose
+///   it; if cleanup doesn't help, the operator sees the journal warning.
+///
+/// Errors if `XDG_RUNTIME_DIR` is unset or relative, or the re-probe fails.
+async fn heal_podman_state() -> anyhow::Result<PodmanHealOutcome> {
+    if probe_podman_ok().await {
+        return Ok(PodmanHealOutcome::Healthy);
+    }
+    // Wedged. Clean runtime state and try again.
+    let runroot = std::env::var_os("XDG_RUNTIME_DIR")
+        .map(PathBuf::from)
+        .context("XDG_RUNTIME_DIR not set; can't locate podman runtime state to clean")?;
+    // Empty/relative values would resolve the joins below against our cwd.
+    if !runroot.is_absolute() {
+        anyhow::bail!("XDG_RUNTIME_DIR {:?} is not an absolute path; refusing to clean", runroot);
+    }
+    for sub in &["containers", "libpod", "podman"] {
+        let path = runroot.join(sub);
+        match fs::remove_dir_all(&path).await {
+            Ok(()) => debug!(path = %path.display(), "removed podman runtime state dir"),
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            Err(e) => warn!(path = %path.display(), "remove failed: {}", e),
+        }
+    }
+    if !probe_podman_ok().await {
+        anyhow::bail!(
+            "podman info still failing after runtime cleanup; storage may be corrupt — operator must intervene"
+        );
+    }
+    Ok(PodmanHealOutcome::Cleaned)
+}
+
+/// True iff `podman info` exits 0 within 5s. Any timeout, spawn failure,
+/// or non-zero exit reads as "wedged" and triggers cleanup.
+async fn probe_podman_ok() -> bool {
+    let probe = tokio::time::timeout(
+        std::time::Duration::from_secs(5),
+        tokio::process::Command::new("podman")
+            .arg("info")
+            .arg("--format=json")
+            // On timeout the future is dropped; kill the hung child with it.
+            .kill_on_drop(true)
+            .output(),
+    )
+    .await;
+    matches!(probe, Ok(Ok(out)) if out.status.success())
 }
 
 /// Make sure /var/lib/archipelago/secrets/ stays 0700 owned by archipelago,