/// Reports whether `pid` refers to a live process, judged by the existence of
/// its `/proc/<pid>` entry.
///
/// Non-positive PIDs are never considered alive. The check is best-effort: a
/// PID that has been reused by an unrelated process still reads as alive,
/// which only postpones zombie detection by a cycle and never causes a healthy
/// container to be recreated.
fn pid_is_alive(pid: i32) -> bool {
    // Guard first: 0 and negative values cannot name a process.
    if pid <= 0 {
        return false;
    }
    let proc_entry = Path::new("/proc").join(pid.to_string());
    proc_entry.exists()
}
+async fn container_running_process_alive(name: &str) -> bool { + let out = match tokio::process::Command::new("podman") + .args(["inspect", "--format", "{{.State.Pid}}", name]) + .output() + .await + { + Ok(o) if o.status.success() => o, + _ => return true, // can't determine — don't destabilize a healthy app + }; + match String::from_utf8_lossy(&out.stdout).trim().parse::() { + // A genuinely running container always has a supervised PID > 0 whose + // /proc entry exists. A dead PID (or PID <= 0 alongside state "running") + // is the anomaly we're catching. + Ok(pid) => pid_is_alive(pid), + Err(_) => true, // unparseable (older podman / odd output) — assume alive + } +} + async fn wait_for_container_stable_running( runtime: &dyn ContainerRuntimeTrait, name: &str, @@ -1450,6 +1493,26 @@ impl ProdContainerOrchestrator { } match status.state { ContainerState::Running => { + // Zombie guard: podman can report a container "running" + // after its process has died (conmon SIGKILLed in a + // cgroup cascade on archipelago restart, etc.). Such a + // container serves nothing yet would be NoOp'd forever. + // Recreate it from the manifest. This is the ONLY path + // that recovers a dead dependency with no published host + // port (netbird-dashboard on .228, 2026-06-25 — stale + // "Up" → proxy 502 → NetBird login broke). Conservative: + // only fires on a concrete dead PID, never on uncertainty. + if !container_running_process_alive(&name).await { + tracing::warn!( + app_id = %app_id, + container = %name, + "container reported running but its process is dead (zombie) — recreating" + ); + let _ = self.runtime.stop_container(&name).await; + let _ = self.runtime.remove_container(&name).await; + self.install_fresh(lm).await?; + return Ok(ReconcileAction::Installed); + } // App-specific hooks get a chance to refresh bind-mounted // config. bitcoin-ui: re-render nginx.conf if the RPC // password rotated (or template changed via OTA). 
/// Exercises `pid_is_alive` with PIDs whose liveness is known in advance.
///
/// Gated to Linux because the helper answers by looking for `/proc/<pid>`;
/// on a host without procfs the "own process is alive" assertion cannot hold.
#[cfg(target_os = "linux")]
#[test]
fn pid_is_alive_detects_live_and_dead_pids() {
    // Our own process is alive. Convert with a checked cast so an
    // out-of-range PID fails loudly instead of wrapping to a negative value.
    let own_pid = i32::try_from(std::process::id()).expect("own PID fits in i32");
    assert!(pid_is_alive(own_pid));
    // Non-positive PIDs are never alive (a "running" container with PID 0 is
    // exactly the zombie case).
    assert!(!pid_is_alive(0));
    assert!(!pid_is_alive(-1));
    // A PID far above the kernel's pid_max can't name a live process, so the
    // zombie guard reports it dead → the reconciler recreates.
    assert!(!pid_is_alive(2_000_000_000));
}