fix(orchestrator): recreate zombie "Up" containers whose process is dead
podman trusts its own state DB: when a container's conmon dies without
podman observing it (cgroup-cascade SIGKILL on archipelago.service
restart, a crash), `podman ps` keeps reporting it "Up" long after the
process is gone. The reconciler NoOp'd such a zombie forever, so a dead
dependency with no published host port never recovered.
Observed live on .228 (2026-06-25): netbird-dashboard reported "Up" with
a dead State.Pid → its nginx proxy 502'd → NetBird login broke
("Unauthenticated"). The dashboard publishes no host port, so the
Running branch had nothing to probe and never recreated it.
Add a zombie guard to the Running branch: verify the recorded State.Pid
is alive (its /proc entry exists) before trusting "running"; on a
concrete dead PID, stop+remove+install_fresh from the manifest.
Conservative by design — any uncertainty (inspect failed, PID
unparseable) assumes alive, so a transient podman hiccup never destroys
a healthy container. Unit test covers live/dead/out-of-range PIDs.
Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
This commit is contained in:
parent
43e700498b
commit
0a8db9044f
@ -701,6 +701,49 @@ async fn remove_stale_podman_socket_path(socket_path: &str) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Reports whether `pid` names a live process, judged by the presence of its
/// `/proc/<pid>` entry.
///
/// * `pid <= 0` is never alive.
/// * If the `/proc` lookup itself fails (an I/O error other than "not found"),
///   the answer is unknown and this returns `true`: callers use a `false` result
///   to destroy and recreate a container, so uncertainty must never read as
///   "dead".
///
/// Best-effort: a reused PID can read as alive, but that only delays zombie
/// detection a cycle — it never recreates a healthy container.
fn pid_is_alive(pid: i32) -> bool {
    if pid <= 0 {
        return false;
    }
    // `Path::exists` collapses every error into `false`; `try_exists` keeps
    // "definitely absent" (`Ok(false)`) apart from "could not check" (`Err`).
    Path::new(&format!("/proc/{pid}"))
        .try_exists()
        .unwrap_or(true)
}
|
||||||
|
|
||||||
|
/// Whether the process backing a podman **"running"** container is actually alive.
|
||||||
|
///
|
||||||
|
/// Podman trusts its own state DB: if a container's conmon dies without podman
|
||||||
|
/// observing it (a cgroup-cascade SIGKILL when `archipelago.service` restarts, a
|
||||||
|
/// crash), `podman ps` keeps reporting the container **"Up"** long after the
|
||||||
|
/// process is gone — a ZOMBIE. It serves nothing (its port is dead), yet the
|
||||||
|
/// reconciler NoOps it forever because the state says Running. Verify the
|
||||||
|
/// recorded main PID is alive so the caller can recreate a zombie rather than
|
||||||
|
/// trust the stale "running".
|
||||||
|
///
|
||||||
|
/// Conservative by design: any uncertainty (inspect failed, PID unparseable)
|
||||||
|
/// returns `true` (assume alive) so a transient podman hiccup never destroys a
|
||||||
|
/// healthy container. Only a concrete, dead PID returns `false`.
|
||||||
|
///
|
||||||
|
/// Observed live on .228 (2026-06-25): `netbird-dashboard` reported "Up" with
|
||||||
|
/// `State.Pid` 1394766 already gone → its nginx proxy 502'd → NetBird login
|
||||||
|
/// broke ("Unauthenticated"). The reconciler never recovered it because the
|
||||||
|
/// dashboard publishes no host port, so the Running branch had nothing to probe.
|
||||||
|
async fn container_running_process_alive(name: &str) -> bool {
|
||||||
|
let out = match tokio::process::Command::new("podman")
|
||||||
|
.args(["inspect", "--format", "{{.State.Pid}}", name])
|
||||||
|
.output()
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(o) if o.status.success() => o,
|
||||||
|
_ => return true, // can't determine — don't destabilize a healthy app
|
||||||
|
};
|
||||||
|
match String::from_utf8_lossy(&out.stdout).trim().parse::<i32>() {
|
||||||
|
// A genuinely running container always has a supervised PID > 0 whose
|
||||||
|
// /proc entry exists. A dead PID (or PID <= 0 alongside state "running")
|
||||||
|
// is the anomaly we're catching.
|
||||||
|
Ok(pid) => pid_is_alive(pid),
|
||||||
|
Err(_) => true, // unparseable (older podman / odd output) — assume alive
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn wait_for_container_stable_running(
|
async fn wait_for_container_stable_running(
|
||||||
runtime: &dyn ContainerRuntimeTrait,
|
runtime: &dyn ContainerRuntimeTrait,
|
||||||
name: &str,
|
name: &str,
|
||||||
@ -1450,6 +1493,26 @@ impl ProdContainerOrchestrator {
|
|||||||
}
|
}
|
||||||
match status.state {
|
match status.state {
|
||||||
ContainerState::Running => {
|
ContainerState::Running => {
|
||||||
|
// Zombie guard: podman can report a container "running"
|
||||||
|
// after its process has died (conmon SIGKILLed in a
|
||||||
|
// cgroup cascade on archipelago restart, etc.). Such a
|
||||||
|
// container serves nothing yet would be NoOp'd forever.
|
||||||
|
// Recreate it from the manifest. This is the ONLY path
|
||||||
|
// that recovers a dead dependency with no published host
|
||||||
|
// port (netbird-dashboard on .228, 2026-06-25 — stale
|
||||||
|
// "Up" → proxy 502 → NetBird login broke). Conservative:
|
||||||
|
// only fires on a concrete dead PID, never on uncertainty.
|
||||||
|
if !container_running_process_alive(&name).await {
|
||||||
|
tracing::warn!(
|
||||||
|
app_id = %app_id,
|
||||||
|
container = %name,
|
||||||
|
"container reported running but its process is dead (zombie) — recreating"
|
||||||
|
);
|
||||||
|
let _ = self.runtime.stop_container(&name).await;
|
||||||
|
let _ = self.runtime.remove_container(&name).await;
|
||||||
|
self.install_fresh(lm).await?;
|
||||||
|
return Ok(ReconcileAction::Installed);
|
||||||
|
}
|
||||||
// App-specific hooks get a chance to refresh bind-mounted
|
// App-specific hooks get a chance to refresh bind-mounted
|
||||||
// config. bitcoin-ui: re-render nginx.conf if the RPC
|
// config. bitcoin-ui: re-render nginx.conf if the RPC
|
||||||
// password rotated (or template changed via OTA). If
|
// password rotated (or template changed via OTA). If
|
||||||
@ -4829,4 +4892,17 @@ app:
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
fn pid_is_alive_detects_live_and_dead_pids() {
    // The test process itself is certainly alive.
    let own_pid = std::process::id() as i32;
    assert!(pid_is_alive(own_pid));

    // PIDs that must read as dead, in order:
    //   0, -1         — non-positive PIDs are never alive (a "running"
    //                   container with PID 0 is exactly the zombie case);
    //   2_000_000_000 — far above the kernel's pid_max, so it can't name a
    //                   live process and the zombie guard reports it dead →
    //                   the reconciler recreates.
    for dead_pid in [0, -1, 2_000_000_000] {
        assert!(!pid_is_alive(dead_pid));
    }
}
|
||||||
}
|
}
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user