From bd7edb4376a0fbd55b6ca51320b53e33e43664cf Mon Sep 17 00:00:00 2001 From: archipelago Date: Sat, 4 Jul 2026 13:50:00 -0400 Subject: [PATCH] feat(update): deepen post-OTA verification beyond a frontend 200 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit verify_pending_update previously cleared the rollback marker on any 2xx/3xx from GET / — a release with a dead RPC API or broken podman access passed and never rolled back. Verification now requires, in the same attempt: the frontend via nginx, backend RPC liveness (an unauthenticated POST /rpc/v1 — 401 proves the stack is up, 5xx/404/ refused fails it), and rootless podman reachability. A pre-loop check also asserts the running binary's version matches what the marker says was applied, catching a silent or half swap deterministically. Per-app container assertions are deliberately excluded: the pre-Quadlet service restart legitimately takes containers down and the boot reconciler can need minutes for heavy apps — that would false-rollback healthy updates. Revisit after the Phase-3 flip. §B of the 1.8.0 hardening plan; update suite 38/38 green. Co-Authored-By: Claude Fable 5 --- core/archipelago/src/update.rs | 125 ++++++++++++++++++++++----- docs/1.8.0-RELEASE-HARDENING-PLAN.md | 14 +-- 2 files changed, 112 insertions(+), 27 deletions(-) diff --git a/core/archipelago/src/update.rs b/core/archipelago/src/update.rs index 5a0744d4..848e464a 100644 --- a/core/archipelago/src/update.rs +++ b/core/archipelago/src/update.rs @@ -513,8 +513,64 @@ async fn probe_frontend_once() -> Result<()> { anyhow::bail!("frontend probe returned HTTP {}", status); } +/// Probe the backend RPC API through nginx. An unauthenticated call is +/// EXPECTED to get 401/403 — any such response proves the Rust HTTP stack +/// is alive behind nginx. 
5xx (backend dead → nginx 502), 404 (proxy +/// misroute), or connection errors mean the API is down even though the +/// static frontend may still serve — exactly the failure mode the plain +/// `GET /` probe waved through. +async fn probe_backend_once() -> Result<()> { + let client = reqwest::Client::builder() + .danger_accept_invalid_certs(true) + .timeout(std::time::Duration::from_secs(5)) + .build() + .context("build probe client")?; + let body = serde_json::json!({ "method": "update.status" }); + let resp = match client + .post("https://127.0.0.1/rpc/v1") + .json(&body) + .send() + .await + { + Ok(resp) => resp, + Err(e) if e.is_connect() => client + .post("http://127.0.0.1/rpc/v1") + .json(&body) + .send() + .await + .context("probe POST http://127.0.0.1/rpc/v1 (https not bound on loopback)")?, + Err(e) => return Err(e).context("probe POST https://127.0.0.1/rpc/v1"), + }; + let status = resp.status(); + if status.is_server_error() || status == reqwest::StatusCode::NOT_FOUND { + anyhow::bail!("backend RPC probe returned HTTP {}", status); + } + Ok(()) +} + +/// Probe that the rootless container runtime is reachable from the new +/// binary (uid mapping / podman socket intact after the swap). A healthy +/// node answers `podman ps` in well under a second. +async fn probe_container_runtime_once() -> Result<()> { + let out = tokio::process::Command::new("podman") + .args(["ps", "--format", "{{.Names}}"]) + .output() + .await + .context("spawn podman ps")?; + if !out.status.success() { + anyhow::bail!( + "podman ps exited {}: {}", + out.status, + String::from_utf8_lossy(&out.stderr).trim() + ); + } + Ok(()) +} + /// Called from main.rs startup. If a pending-verification marker is -/// present, probe the frontend; on failure, trigger rollback and +/// present, verify the node actually works on the new version — binary +/// version matches the marker, frontend serves, backend RPC answers, +/// rootless podman is reachable. 
On failure, trigger rollback and /// restart the service so the OLD binary boots. /// /// This is the "post-OTA auto-rollback" guardrail. If ANY problem in @@ -547,34 +603,59 @@ pub async fn verify_pending_update(data_dir: &Path) { info!( new_version = %marker.new_version, previous_version = %marker.previous_version, - "Post-OTA verification: probing frontend at https://127.0.0.1/" + "Post-OTA verification: probing frontend, backend RPC, and container runtime" ); - // Give the new service time to bind its listeners + nginx to - // pick up any config changes. 15s matches what we observed on - // .116 during the v1.7.40 rollout recovery. - tokio::time::sleep(std::time::Duration::from_secs(15)).await; - - let deadline = - std::time::Instant::now() + std::time::Duration::from_secs(PENDING_VERIFY_WINDOW_SECS); + // Binary identity check: if the running binary's version isn't the one + // the marker says we applied, the swap silently failed (or half-applied + // — new frontend with old binary). Deterministic, so no retry loop: + // fall through straight to rollback to restore a matched pair. + let running = env!("CARGO_PKG_VERSION"); let mut attempt = 0u32; let mut last_err: Option<String> = None; + let version_ok = running == marker.new_version; + if !version_ok { + last_err = Some(format!( + "running binary is {} but marker says {} was applied — binary swap failed", + running, marker.new_version + )); + } else { + // Give the new service time to bind its listeners + nginx to + // pick up any config changes. 15s matches what we observed on + // .116 during the v1.7.40 rollout recovery. 
+ tokio::time::sleep(std::time::Duration::from_secs(15)).await; - while std::time::Instant::now() < deadline { - attempt += 1; - match probe_frontend_once().await { - Ok(()) => { - info!(attempt, "Post-OTA verification succeeded — clearing marker"); - clear_pending_verification(data_dir).await; - return; - } - Err(e) => { - let msg = e.to_string(); - tracing::warn!(attempt, error = %msg, "Post-OTA probe failed, retrying"); - last_err = Some(msg); + let deadline = std::time::Instant::now() + + std::time::Duration::from_secs(PENDING_VERIFY_WINDOW_SECS); + + while std::time::Instant::now() < deadline { + attempt += 1; + // All three must pass in the same attempt: static frontend via + // nginx, backend RPC liveness, and rootless-podman reachability. + // (Individual app containers are NOT asserted — the pre-Quadlet + // service restart legitimately takes them down and the boot + // reconciler can need minutes to bring heavy apps back.) + let result = match probe_frontend_once().await { + Ok(()) => match probe_backend_once().await { + Ok(()) => probe_container_runtime_once().await, + Err(e) => Err(e), + }, + Err(e) => Err(e), + }; + match result { + Ok(()) => { + info!(attempt, "Post-OTA verification succeeded — clearing marker"); + clear_pending_verification(data_dir).await; + return; + } + Err(e) => { + let msg = format!("{e:#}"); + tracing::warn!(attempt, error = %msg, "Post-OTA probe failed, retrying"); + last_err = Some(msg); + } } + tokio::time::sleep(std::time::Duration::from_secs(5)).await; } - tokio::time::sleep(std::time::Duration::from_secs(5)).await; } tracing::error!( diff --git a/docs/1.8.0-RELEASE-HARDENING-PLAN.md b/docs/1.8.0-RELEASE-HARDENING-PLAN.md index ce9f253a..67a5c887 100644 --- a/docs/1.8.0-RELEASE-HARDENING-PLAN.md +++ b/docs/1.8.0-RELEASE-HARDENING-PLAN.md @@ -86,11 +86,15 @@ atomic swap, single-depth backup). 
The gaps are **authenticity** (§A) and **verification depth** — plus the fact that the upgrade path has never run end-to-end on real hardware. -- [ ] 🔴 **Deepen the post-OTA health check.** `update.rs:456` (`probe_frontend_once`) - passes on any 2xx/3xx from `GET /`, and `verify_pending_update` (494-593) only rolls - back on that. A release with a broken RPC API, dead containers, or failed LND unlock - passes and never rolls back. Add `/rpc/v1 update.status` + container-list/required-stack - health assertions before clearing the pending-verify marker. +- [x] 🔴 **Deepen the post-OTA health check.** DONE 2026-07-03: `verify_pending_update` + now requires, in the same attempt, (1) frontend 2xx/3xx via nginx, (2) backend RPC + liveness — unauthenticated POST `/rpc/v1`; 401/403 = alive, 5xx/404/refused = dead, + so a 502-behind-static-files release now rolls back, (3) rootless `podman ps` + reachability; plus a pre-loop binary-version==marker assertion that catches a silent + or half swap (new frontend + old binary) deterministically. Per-app container + assertions deliberately EXCLUDED — the pre-Quadlet service restart legitimately kills + containers and the reconciler can need minutes (false-rollback risk); revisit after + the Phase-3 flip. LND-unlock-level checks remain out of scope for the 90s window. - [ ] 🟠 **Run one real upgrade-from-vN-1 soak on hardware before tagging.** No test installs the previous version, points it at a staged 1.8.0 manifest, applies, and asserts health + rollback. This is the top release risk for an OTA release. A