Compare commits
3 Commits
43e700498b
...
3515344800
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
3515344800 | ||
|
|
670ebb0666 | ||
|
|
0a8db9044f |
@@ -701,6 +701,49 @@ async fn remove_stale_podman_socket_path(socket_path: &str) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// True when `pid` names a live process (its `/proc/<pid>` entry exists).
|
||||||
|
/// `pid <= 0` is never alive. (Best-effort: a reused PID can read as alive, but
|
||||||
|
/// that only delays zombie detection until the reused PID exits — it never recreates a healthy one.)
|
||||||
|
fn pid_is_alive(pid: i32) -> bool {
|
||||||
|
pid > 0 && Path::new(&format!("/proc/{pid}")).exists()
|
||||||
|
}
|
||||||
|
|
||||||
|
/// Whether the process backing a podman **"running"** container is actually alive.
|
||||||
|
///
|
||||||
|
/// Podman trusts its own state DB: if a container's conmon dies without podman
|
||||||
|
/// observing it (a cgroup-cascade SIGKILL when `archipelago.service` restarts, a
|
||||||
|
/// crash), `podman ps` keeps reporting the container **"Up"** long after the
|
||||||
|
/// process is gone — a ZOMBIE. It serves nothing (its port is dead), yet the
|
||||||
|
/// reconciler NoOps it forever because the state says Running. Verify the
|
||||||
|
/// recorded main PID is alive so the caller can recreate a zombie rather than
|
||||||
|
/// trust the stale "running".
|
||||||
|
///
|
||||||
|
/// Conservative by design: any uncertainty (inspect failed, PID unparseable)
|
||||||
|
/// returns `true` (assume alive) so a transient podman hiccup never destroys a
|
||||||
|
/// healthy container. Only a concrete, dead PID returns `false`.
|
||||||
|
///
|
||||||
|
/// Observed live on .228 (2026-06-25): `netbird-dashboard` reported "Up" with
|
||||||
|
/// `State.Pid` 1394766 already gone → its nginx proxy 502'd → NetBird login
|
||||||
|
/// broke ("Unauthenticated"). The reconciler never recovered it because the
|
||||||
|
/// dashboard publishes no host port, so the Running branch had nothing to probe.
|
||||||
|
async fn container_running_process_alive(name: &str) -> bool {
|
||||||
|
let out = match tokio::process::Command::new("podman")
|
||||||
|
.args(["inspect", "--format", "{{.State.Pid}}", name])
|
||||||
|
.output()
|
||||||
|
.await
|
||||||
|
{
|
||||||
|
Ok(o) if o.status.success() => o,
|
||||||
|
_ => return true, // can't determine — don't destabilize a healthy app
|
||||||
|
};
|
||||||
|
match String::from_utf8_lossy(&out.stdout).trim().parse::<i32>() {
|
||||||
|
// A genuinely running container always has a supervised PID > 0 whose
|
||||||
|
// /proc entry exists. A dead PID (or PID <= 0 alongside state "running")
|
||||||
|
// is the anomaly we're catching.
|
||||||
|
Ok(pid) => pid_is_alive(pid),
|
||||||
|
Err(_) => true, // unparseable (older podman / odd output) — assume alive
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
async fn wait_for_container_stable_running(
|
async fn wait_for_container_stable_running(
|
||||||
runtime: &dyn ContainerRuntimeTrait,
|
runtime: &dyn ContainerRuntimeTrait,
|
||||||
name: &str,
|
name: &str,
|
||||||
@@ -1450,6 +1493,26 @@ impl ProdContainerOrchestrator {
|
|||||||
}
|
}
|
||||||
match status.state {
|
match status.state {
|
||||||
ContainerState::Running => {
|
ContainerState::Running => {
|
||||||
|
// Zombie guard: podman can report a container "running"
|
||||||
|
// after its process has died (conmon SIGKILLed in a
|
||||||
|
// cgroup cascade on archipelago restart, etc.). Such a
|
||||||
|
// container serves nothing yet would be NoOp'd forever.
|
||||||
|
// Recreate it from the manifest. This is the ONLY path
|
||||||
|
// that recovers a dead dependency with no published host
|
||||||
|
// port (netbird-dashboard on .228, 2026-06-25 — stale
|
||||||
|
// "Up" → proxy 502 → NetBird login broke). Conservative:
|
||||||
|
// only fires on a concrete dead PID, never on uncertainty.
|
||||||
|
if !container_running_process_alive(&name).await {
|
||||||
|
tracing::warn!(
|
||||||
|
app_id = %app_id,
|
||||||
|
container = %name,
|
||||||
|
"container reported running but its process is dead (zombie) — recreating"
|
||||||
|
);
|
||||||
|
let _ = self.runtime.stop_container(&name).await;
|
||||||
|
let _ = self.runtime.remove_container(&name).await;
|
||||||
|
self.install_fresh(lm).await?;
|
||||||
|
return Ok(ReconcileAction::Installed);
|
||||||
|
}
|
||||||
// App-specific hooks get a chance to refresh bind-mounted
|
// App-specific hooks get a chance to refresh bind-mounted
|
||||||
// config. bitcoin-ui: re-render nginx.conf if the RPC
|
// config. bitcoin-ui: re-render nginx.conf if the RPC
|
||||||
// password rotated (or template changed via OTA). If
|
// password rotated (or template changed via OTA). If
|
||||||
@@ -4829,4 +4892,17 @@ app:
|
|||||||
)
|
)
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn pid_is_alive_detects_live_and_dead_pids() {
|
||||||
|
// Our own process is alive.
|
||||||
|
assert!(pid_is_alive(std::process::id() as i32));
|
||||||
|
// Non-positive PIDs are never alive (a "running" container with PID 0 is
|
||||||
|
// exactly the zombie case).
|
||||||
|
assert!(!pid_is_alive(0));
|
||||||
|
assert!(!pid_is_alive(-1));
|
||||||
|
// A PID far above the kernel's pid_max can't name a live process, so the
|
||||||
|
// zombie guard reports it dead → the reconciler recreates.
|
||||||
|
assert!(!pid_is_alive(2_000_000_000));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -121,6 +121,10 @@ impl PodmanClient {
|
|||||||
"cryptpad" => "http://localhost:3003",
|
"cryptpad" => "http://localhost:3003",
|
||||||
"penpot" => "http://localhost:9001",
|
"penpot" => "http://localhost:9001",
|
||||||
"immich_server" | "immich" => "http://localhost:2283",
|
"immich_server" | "immich" => "http://localhost:2283",
|
||||||
|
// Gitea publishes SSH (2222) and web (3001). Without a manifest on
|
||||||
|
// disk, extract_lan_address() returns whichever podman lists first —
|
||||||
|
// which can be the SSH port, breaking the launch. Pin the web UI.
|
||||||
|
"gitea" => "http://localhost:3001",
|
||||||
"nginx-proxy-manager" => "http://localhost:8081",
|
"nginx-proxy-manager" => "http://localhost:8081",
|
||||||
"fedimint-gateway" => "http://localhost:8176",
|
"fedimint-gateway" => "http://localhost:8176",
|
||||||
"endurain" => "http://localhost:8080",
|
"endurain" => "http://localhost:8080",
|
||||||
|
|||||||
@@ -7,7 +7,7 @@
|
|||||||
> in §6 / §8b. Next exit-criteria: multinode (`docs/multinode-testing-plan.md`) +
|
> in §6 / §8b. Next exit-criteria: multinode (`docs/multinode-testing-plan.md`) +
|
||||||
> workstreams B/C/D.
|
> workstreams B/C/D.
|
||||||
>
|
>
|
||||||
> Last updated: 2026-06-23 · **.228 gate 5×-GREEN (110/110 ×5, 0 not-ok)** — exit criterion met (see §8b).
|
> Last updated: 2026-06-26 · zombie-container guard + gitea launch-port fix shipped, binary `040df5ce` rolled to the fleet (see §8b SESSION h). Prior: orchestrator Fix A+B (`a721532f`/`e0343137`) deployed + proven.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
@@ -243,9 +243,102 @@ hardening; paid swarm streaming + IndeeHub source (`phase4-streaming-ecash-plan.
|
|||||||
Meshroller Rust-native mesh AI (`meshroller-integration-design.md`); dual-ecash
|
Meshroller Rust-native mesh AI (`meshroller-integration-design.md`); dual-ecash
|
||||||
phases 2–6 (`dual-ecash-design.md`).
|
phases 2–6 (`dual-ecash-design.md`).
|
||||||
|
|
||||||
## 8b. SESSION STATE + RESUME (updated 2026-06-23) — READ §8b "CURRENT STATE + RESUME" FIRST
|
## 8b. SESSION STATE + RESUME (updated 2026-06-26) — READ §8b "CURRENT STATE + RESUME" FIRST
|
||||||
|
|
||||||
### ▶ SESSION b (2026-06-23 PM) — LATEST, RESUME FROM HERE
|
### ▶ SESSION h (2026-06-26) — LATEST, RESUME FROM HERE
|
||||||
|
|
||||||
|
**Canonical resume detail: memory `project_session_resume_2026_06_23b` (▶️ top of MEMORY.md).**
|
||||||
|
Local main = `670ebb06` (2 commits past the previously-pushed `43e70049`: `0a8db904` zombie
|
||||||
|
guard + `670ebb06` gitea launch-port fix; `43e70049` webview was already pushed). **Combined
|
||||||
|
release binary `040df5ce2551d17b` rolled to the fleet.** Binary+FE not in git — rebuild on a
|
||||||
|
fresh machine (`cd core && CARGO_INCREMENTAL=0 cargo build --release -p archipelago`).
|
||||||
|
|
||||||
|
**DONE this session:**
|
||||||
|
1. ✅ **Zombie-container guard** (`0a8db904`) — the reconciler's Running branch now verifies a
|
||||||
|
container's `State.Pid` is alive (`/proc/<pid>` exists) before trusting podman's "Up"; on a
|
||||||
|
concrete dead PID it stops and removes the container, then runs `install_fresh` from the manifest. Conservative: any
|
||||||
|
uncertainty (inspect fail / unparseable PID) assumes alive, so a transient hiccup never
|
||||||
|
destroys a healthy container. Fixes the class that broke NetBird login on .228 (dashboard
|
||||||
|
"Up" w/ dead PID → proxy 502, no host port → reconciler never recovered it). Unit test +
|
||||||
|
**live-proven on .228**: synthetic zombie on `jellyfin` (killed conmon+PID → podman still
|
||||||
|
"Up") → guard logged `…process is dead (zombie) — recreating app_id=jellyfin` → recreated →
|
||||||
|
settled to NoOp. **Zero false-positives across the other 33 healthy containers.**
|
||||||
|
2. ✅ **Gitea launch-port fix** (`670ebb06`) — gitea launched at **:2222 (SSH)** instead of
|
||||||
|
**:3001 (web)** on nodes without the gitea manifest on disk (`manifest_lan_address_for`
|
||||||
|
returns None → fell through to `extract_lan_address`, which returns podman's first-listed
|
||||||
|
port; podman lists `2222->22` before `3001->3000`). Added `"gitea" => "http://localhost:3001"`
|
||||||
|
to the static `lan_address_for` map (`core/container/src/podman_client.rs`) like every other
|
||||||
|
core app. Reported on tailscale node **100.82.34.38** — that node still needs the new binary
|
||||||
|
(or a refreshed gitea manifest) to pick it up.
|
||||||
|
3. ✅ **Rolled `040df5ce`** to .228/.116/.198/.89 (verified sha+active); .88/.5/.120 rolling.
|
||||||
|
|
||||||
|
**OPEN follow-ups (logged, NOT regressions):**
|
||||||
|
- **mempool env-drift recreate-loop on .228** — reconciler logs `container env drift detected —
|
||||||
|
recreating app_id=mempool` every ~30-90s, never converges (pre-existing; the known mempool
|
||||||
|
nginx stale-IP class, [[project_mempool_nginx_stale_ip_fix]]). mempool stays running but churns.
|
||||||
|
- **nostr-rs-relay** stuck "Stopping" + ~2s create-loop on .228 (from session g).
|
||||||
|
|
||||||
|
**NEXT:** finish .88/.5/.120 roll → push main to gitea-vps2 → Phase-3 quadlet / Workstream F /
|
||||||
|
multinode. SSH/sudo pw `ThisIsWeb54321@` (**.88 = `ThisIsWeb54321!`**); UI/RPC .228/.198 =
|
||||||
|
`ThisIsWeb54321@`. Reusable tooling in scratchpad: `deploy-bin.sh`/`remote-apply.sh` (EXPECT_SHA
|
||||||
|
= `040df5ce…`), `rpc.sh`.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ▶ SESSION g (2026-06-25) — earlier, historical
|
||||||
|
|
||||||
|
**Canonical resume detail: memory `project_session_resume_2026_06_23b` + `project_netbird_ph4_legacy_deletion_map` + `project_workstream_f_lifecycle_perfection`.**
|
||||||
|
`gitea-vps2/main = a721532f` (pushed). **Local main = `89d397bb`** (2 new commits this session, NOT pushed/deployed: `41e7f500` harness tolerance + `89d397bb` netbird ph4 legacy delete). Binary+FE are NOT in git — rebuild on a fresh machine.
|
||||||
|
|
||||||
|
**TL;DR (SESSION g, 2026-06-25) — everything below DONE this session:**
|
||||||
|
1. ✅ **Rolled** `e0343137` + fresh FE (`index-a75rd6Hy.js`) to **7 nodes** (.116/.198/.228/.89/.88/.5/.120), all verified. **.15 SKIPPED** (auth rejected — creds don't match).
|
||||||
|
2. ✅ **Harness tolerance fixes COMMITTED** `41e7f500` (run-gate settle/immich + immich.bats 90s + mempool.bats poll).
|
||||||
|
3. ✅ **mempool RESOLVED** fleet-wide — see mempool note below.
|
||||||
|
4. ✅ **netbird #20 ph4 DONE** — legacy Rust installer DELETED, committed `89d397bb` (492 lines gone, manifest-driven only, `cargo check` clean). Release binary BUILDING for the .228 live-verify (build left running — check after).
|
||||||
|
|
||||||
|
**NEXT (resume here):** (a) check the release build, deploy the `89d397bb` binary to .228, live-verify netbird adopts via manifest (https:8087→200, no `bail!`); (b) roll `89d397bb` to the rest of the fleet (behavior-neutral — manifest path already executed); (c) **push local main → gitea-vps2** (2 commits ahead); then **Phase-3 `use_quadlet_backends` → Workstream F → multinode**.
|
||||||
|
|
||||||
|
**ROLL RESULTS (2026-06-25, binary `e0343137b99bf066` + fresh FE bundled):**
|
||||||
|
| Node | Result |
|
||||||
|
|------|--------|
|
||||||
|
| .228 | ✅ already on `e0343137` (prior session, binary-only) |
|
||||||
|
| .116 (local) | ✅ binary + fresh FE; 36 containers survived restart; UI 200; `index-a75rd6Hy.js` live |
|
||||||
|
| .198 (LAN) | ✅ binary + fresh FE; 38 containers up; UI 200 |
|
||||||
|
| .89 (100.89.209.89) | ✅ binary + fresh FE; service active |
|
||||||
|
| .88 (100.70.96.88, pw `ThisIsWeb54321!`) | ✅ binary + fresh FE; service active |
|
||||||
|
| .5 (100.72.136.5) | ⏳ attempted — see resume note (cellular x250) |
|
||||||
|
| .120 (100.66.157.120) | ⏳ attempted — see resume note (cellular x250) |
|
||||||
|
| .15 (100.64.83.15, archy-dev-pa) | ❌ SKIPPED — `archipelago@` + `ThisIsWeb54321@` rejected (`Permission denied (publickey,password)`); node creds unknown |
|
||||||
|
|
||||||
|
Deploy tooling (reusable): scratchpad `deploy-bin.sh <label> <local\|ssh\|ts> <host> <pw>` + `remote-apply.sh` (mv binary avoids ETXTBSY, atomic FE swap preserving `aiui`/APK/`claude-login.html`, chown 1000:1000, restart, sha+health verify). Frontend tarball = `tar -C web/dist/neode-ui -czf neode-ui.tgz .` (flat). Full sha `e0343137b99bf06642c45da67bb092e9a411190ff59eda8e5177c2a06b6f6e89`.
|
||||||
|
|
||||||
|
**Focus: validate the two UNVALIDATED-WIP orchestrator fixes (commit `a721532f`) on the .228 canary, then roll to the 7-node fleet.**
|
||||||
|
- **Fix A** — desired-state recovery: a was-running app that vanished (e.g. lost through a failed teardown + reboot) auto-recreates on reconcile, via new `crash_recovery::load_last_running_names` (reads `running-containers.json` sans PID gate) + exact container-name match in `reconcile_all_with_mode`. Zero false-positives (uninstalled/user-stopped excluded).
|
||||||
|
- **Fix B** — recreate volume-ownership: a freshly-created bind dir for a NO-`data_uid` app gets `chown --reference=<parent>` so container-root can write → kills the immich-class recreate EACCES crash-loop. Only fresh dirs (zero regression for existing installs).
|
||||||
|
|
||||||
|
VALIDATION PROGRESS (sessions e→f):
|
||||||
|
1. ✅ Release binary built — sha16 `e0343137b99bf066` (differs from pre-fix `f2aa2fab` → fixes compiled in).
|
||||||
|
2. ✅ `cargo test -p archipelago crash_recovery` — **13/13 green**, incl. the two new Fix A tests.
|
||||||
|
3. ✅ Deployed new binary to **.228 canary** (binary-only; FE unchanged at `435b9f92`). Verified live sha `e0343137`, active, RPC OK. Container cgroup confirmed in `user@1000.service` (NOT archipelago.service) → `systemctl stop` is container-safe on .228.
|
||||||
|
4. ✅ **Fix A PROVEN** — `podman rm -f jellyfin` (non-baseline, no-data_uid) → periodic ExistingOnly reconciler (30s) recreated it; journal: `previously-running app has no container after boot — recreating (desired-state recovery) app_id=jellyfin`.
|
||||||
|
5. ✅ **Fix B PROVEN** — fresh `package.install uptime-kuma` (no-data_uid, no prior data dir) → bind dir chowned to parent owner `1000:1000` (NOT root:root), state=running, RestartCount=0, no EACCES, app wrote its own subdirs → clean uninstall (container+data-dir gone). all-apps matrix read-only **5/5 (17 apps)**.
|
||||||
|
6. 🟡 **5× DESTRUCTIVE gate on .228 — NOT yet 5/5, but failures are HARNESS-TOLERANCE FLAKES, NOT Fix A/B regressions** (proven: Fix A logged **0** desired-state-recovery firings during the failures; immich/lnd `RestartCount: 0`, no crashes). Under sustained 5× churn on this 34-app node a *different* heavy-app recovery probe slips each iteration:
|
||||||
|
- immich `lan_address` (test 64): 30s probe too tight after archipelago-restart recovery. **FIXED** (settle_stack now waits on immich :2283 when present, cap 180→300s; test 64 deadline 30→90s). Went **ok/ok/ok 3×** after fix.
|
||||||
|
- mempool orphan count (test 82): single-shot count caught a transient extra container mid-recreate (clears to 3=3). **FIXED locally** (poll for steady-state ≤30s) — fix is in local `tests/lifecycle/bats/mempool.bats`, NOT yet re-gated.
|
||||||
|
- lnd `getinfo recovers after restart` (test 77): already has a generous 240s deadline; peak concurrent load occasionally beats it. lnd itself **HEALTHY** (wallet unlocked — "wallet already unlocked, WalletUnlocker no longer available", RestartCount 0). Likely needs deadline bump or lnd added to within-iteration tolerance. **NOT yet fixed.**
|
||||||
|
- NOTE: the 300s settle bump made iterations very long (iter2=1062s) and a diagnostic run wedged in iter3; killed it. Re-think settle (maybe per-app readiness with shorter caps) before the next run.
|
||||||
|
7. ✅ **DECISION RESOLVED (2026-06-25):** user chose **(B) roll now** AND bundle the fresh UX frontend (per `feedback_deploy_targets_and_ux_bundle`). Gate load-robustness deferred to a separate hardening pass.
|
||||||
|
8. ✅ **ROLLED** `e0343137` + fresh FE (`index-a75rd6Hy.js`) to .116/.198/.89/.88/.5/.120 (.228 already on it) — all verified `sha=e0343137`, service active. **.15 skipped** (auth reject). See roll table above.
|
||||||
|
9. ✅ **Harness fixes COMMITTED** `41e7f500` (no longer uncommitted).
|
||||||
|
10. ✅ **netbird #20 ph4 — legacy installer DELETED**, committed `89d397bb`. `install_netbird_stack` is now orchestrator-manifest → adopt → `bail!` (no in-Rust installer); removed 6 dead helpers + 3 `NETBIRD_*_IMAGE` consts + unused import (~492 lines). `cargo check` clean (0 warnings). Manifest path verified live pre-delete (.228 https:8087→200). **Release binary BUILT: sha `cccb7cfd9c38a651`** (`core/target/release/archipelago`, supersedes `e0343137`) — NOT yet deployed; deploy to .228 + live-verify then roll. Map+rationale: memory `project_netbird_ph4_legacy_deletion_map`. **Pre-existing follow-up (NOT introduced by delete): the manifest path lacks an active #10 OIDC-readiness gate — if that login race resurfaces, add an OIDC-ready gate to the netbird manifest.**
|
||||||
|
|
||||||
|
**✅ 2026-06-25 — STRAY 13h GATE on .228 found + killed; mempool RESOLVED.** A `setsid` gate run from session-e was still churning .228 ~13h later (pathologically slow — only reached test 71/lnd; the 300s settle bump is the suspect). Killed its process group (note: `pkill -f bats` self-matches the ssh command's own argv → kill by numeric PID/PGID instead). After kill, `crash_recovery` (Fix A) auto-recovered the immich/indeedhub/netbird stacks — **good live exercise of Fix A**. **mempool fallout RESOLVED:** the gate churn left .228's podman **overlay storage corrupt** (mempool frontend crash-looped — container couldn't write `/etc/nginx`, same image serves fine on .116) → **fixed by rebooting .228** (clears overlay corruption; Fix A staggered-recovered all apps; mempool stable 200). **.198 is PRUNED** bitcoin → mempool requires archival (install correctly refused) → **cleanly uninstalled** the orphan mempool-db. All nodes now correct. LESSON: never leave the gate running unsupervised; reconsider the 300s settle before re-running.
|
||||||
|
|
||||||
|
Fleet on `e0343137` + FE `index-a75rd6Hy.js` on .116/.198/.228/.89/.88/.5/.120 (.15 still old). **`89d397bb` (netbird-delete) binary NOT yet deployed anywhere — verify on .228 then roll.** SSH/sudo pw UNIFORM `ThisIsWeb54321@` (**.88 = `ThisIsWeb54321!`**); **UI/RPC: .228=`ThisIsWeb54321@`, .198=`ThisIsWeb54321@`.** Reusable tooling in scratchpad: `deploy-bin.sh`/`remote-apply.sh` (binary+FE swap), `rpc.sh <host> <pw> <method> [params]` (auth.login→call). Gate harness at `~/lifecycle/lifecycle` on .228 — **CHECK it isn't already running/wedged before re-launching**.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
### ▶ SESSION b (2026-06-23 PM) — earlier, historical
|
||||||
|
|
||||||
**Canonical resume detail: memory `project_session_resume_2026_06_23b` (▶️ top of MEMORY.md).**
|
**Canonical resume detail: memory `project_session_resume_2026_06_23b` (▶️ top of MEMORY.md).**
|
||||||
`gitea-vps2/main = 4346007d` pushed; local HEAD `e57514b6` (uninstall fix, committed, **not pushed/deployed**).
|
`gitea-vps2/main = 4346007d` pushed; local HEAD `e57514b6` (uninstall fix, committed, **not pushed/deployed**).
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user