fix(bootstrap): self-heal wedged podman runtime state at startup

Closes FM6 (podman bolt_state.db / runtime drift) — observed live on .198 today: bitcoind was running for several minutes, but podman's state DB reported the container as Exited. The reconciler then tried to "restart" it, racing the still-bound port 8332 and failing in a loop. heal_podman_state() runs as the last bootstrap stage, BEFORE the orchestrator's reconcile loop ticks. It probes `podman info` with a 5s timeout; on failure it removes the runtime-state dirs under $XDG_RUNTIME_DIR and re-probes. Persistent storage under ~/.local/share/containers/storage/ is never touched, so containers re-discover from manifests on next call. Cleanup never includes `podman system reset` or `system renumber` — those are destructive and must stay operator-only. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 15:23:36 -04:00 · 2026-05-01 15:23:36 -04:00 · 8f13298805
commit 8f13298805
parent ba2eece9aa
1 changed files with 85 additions and 0 deletions
--- a/core/archipelago/src/bootstrap.rs
+++ b/core/archipelago/src/bootstrap.rs
@ -71,6 +71,91 @@ pub async fn ensure_doctor_installed() {
        Ok(_) => debug!("Secrets directory already at expected mode"),
        Err(e) => warn!("Secrets dir tightening failed (non-fatal): {:#}", e),
    }
+    // Podman self-heal MUST be the last bootstrap stage. If podman's
+    // runtime state is wedged, the orchestrator's first reconcile tick
+    // (which fires seconds after bootstrap returns) will hang or error
+    // on every container. Cleaning the runroot here gives the rest of
+    // the process a healthy podman to talk to.
+    match heal_podman_state().await {
+        Ok(PodmanHealOutcome::Healthy) => debug!("podman runtime state healthy"),
+        Ok(PodmanHealOutcome::Cleaned) => warn!(
+            "podman runtime state was wedged at startup — cleaned runroot and re-probed (CRITICAL)"
+        ),
+        Err(e) => warn!("podman self-heal failed (non-fatal, will retry next boot): {:#}", e),
+    }
+}
+
+/// What `heal_podman_state` found (and did) when it returned successfully.
+#[derive(Debug, PartialEq, Eq)]
+enum PodmanHealOutcome {
+    /// The first `podman info` probe succeeded; nothing was removed.
+    Healthy,
+    /// The first probe failed, cleanup of the runtime-state dirs was
+    /// attempted, and the follow-up probe succeeded.
+    Cleaned,
+}
+
+/// Probe `podman info` and, if the probe fails, clear podman's runtime
+/// state and probe again.
+///
+/// If the first probe succeeds we return `Healthy` immediately. Any
+/// probe failure (timeout, spawn failure, or non-zero exit — see
+/// `probe_podman_ok`) is treated as "runtime state is likely wedged":
+/// we delete `$XDG_RUNTIME_DIR/{containers,libpod,podman}` and
+/// re-probe, expecting podman to rebuild the runtime state from
+/// persistent state under `$HOME/.local/share/containers/storage/`.
+///
+/// Why this is safe at startup:
+/// - We run BEFORE the orchestrator starts its reconcile loop, so no
+///   archipelago code is currently calling podman.
+/// - Persistent container metadata lives under
+///   `~/.local/share/containers/`, which we never touch.
+/// - `unless-stopped` containers and Quadlet-supervised services are
+///   parented under user.slice, not archipelago.service, so they keep
+///   running even while we clean podman's runtime view of them. After
+///   the cleanup + re-probe podman re-discovers them.
+///
+/// What this does NOT cover:
+/// - Storage corruption under `~/.local/share/containers/storage/`.
+///   That requires a destructive `podman system reset`, which we will
+///   never do automatically — operator must intervene.
+/// - Networking corruption (netavark cache). Currently `podman info`
+///   doesn't diagnose that; if cleanup doesn't fix it, the operator
+///   will see the warning in the journal.
+///
+/// # Errors
+/// - `XDG_RUNTIME_DIR` is unset, or is not an absolute path (nothing is
+///   deleted in either case).
+/// - `podman info` still fails after the cleanup attempt.
+async fn heal_podman_state() -> Result<PodmanHealOutcome> {
+    if probe_podman_ok().await {
+        return Ok(PodmanHealOutcome::Healthy);
+    }
+    // Wedged. Locate the runtime dir before deleting anything.
+    let runtime_dir = std::env::var_os("XDG_RUNTIME_DIR")
+        .map(PathBuf::from)
+        .context("XDG_RUNTIME_DIR not set; can't locate podman runtime state to clean")?;
+    // An empty or relative value would make the joins below resolve
+    // against the process's working directory, and `remove_dir_all`
+    // would then delete unrelated `./containers` etc. The XDG spec
+    // requires an absolute path, so refuse anything else.
+    if !runtime_dir.is_absolute() {
+        return Err(anyhow::anyhow!(
+            "XDG_RUNTIME_DIR ({}) is not an absolute path; refusing to clean podman runtime state",
+            runtime_dir.display()
+        ));
+    }
+    for sub in &["containers", "libpod", "podman"] {
+        let path = runtime_dir.join(sub);
+        match fs::remove_dir_all(&path).await {
+            Ok(()) => debug!(path = %path.display(), "removed podman runtime state dir"),
+            // Already absent is the desired end state; nothing to report.
+            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
+            // Best-effort: keep going so the re-probe decides the outcome.
+            Err(e) => warn!(path = %path.display(), "remove failed: {}", e),
+        }
+    }
+    if probe_podman_ok().await {
+        Ok(PodmanHealOutcome::Cleaned)
+    } else {
+        Err(anyhow::anyhow!(
+            "podman info still failing after runtime cleanup; storage may be corrupt — operator must intervene"
+        ))
+    }
+}
+
+/// True iff `podman info` returns 0 within 5s. Any timeout, spawn
+/// failure, or non-zero exit reads as "wedged" and triggers cleanup.
+///
+/// The child is spawned with `kill_on_drop(true)`: when the timeout
+/// fires the pending `output()` future is dropped, and without this a
+/// hung `podman info` would keep running in the background while the
+/// caller deletes the runtime-state dirs and re-probes.
+async fn probe_podman_ok() -> bool {
+    use std::time::Duration;
+    let probe = tokio::time::timeout(
+        Duration::from_secs(5),
+        tokio::process::Command::new("podman")
+            .arg("info")
+            .arg("--format=json")
+            .kill_on_drop(true)
+            .output(),
+    )
+    .await;
+    match probe {
+        // Process ran to completion inside the deadline: trust its exit code.
+        Ok(Ok(out)) => out.status.success(),
+        // Spawn/wait I/O error, or the deadline elapsed.
+        Ok(Err(_)) | Err(_) => false,
+    }
 }

 /// Make sure /var/lib/archipelago/secrets/ stays 0700 owned by archipelago,