From 97ce23d77368325e67bf4f20dd0f4d873f8ad238 Mon Sep 17 00:00:00 2001 From: archipelago Date: Sat, 2 May 2026 05:21:57 -0400 Subject: [PATCH] =?UTF-8?q?feat(quadlet):=20Phase=203.4=20=E2=80=94=20heal?= =?UTF-8?q?th-gated=20startup=20via=20Notify=3Dhealthy?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit QuadletUnit gains an optional HealthSpec; from_manifest translates the manifest's health_check (tcp/http/cmd) into a HealthCmd= directive and emits Notify=healthy alongside it. systemctl start .service then blocks until the container's first green probe — eliminating the "container up but RPC not ready" race the orchestrator currently papers over with post-start polling. Translation policy: * tcp, endpoint "host:port" -> nc -z host port * http, endpoint "host:port", path -> curl -fsS -m 5 http://endpoint * cmd, endpoint "" -> verbatim * unknown type / malformed endpoint -> None (skip Notify=healthy rather than emit a HealthCmd that hangs the unit start forever) Companion units leave health: None and remain byte-identical to before this PR — the renderer only emits the Health* / Notify= block when set. +4 quadlet unit tests (19 total). Dropped a never-used test setter that was generating a dead_code warning. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../src/container/prod_orchestrator.rs | 8 - core/archipelago/src/container/quadlet.rs | 189 ++++++++++++++++++ tests/lifecycle/TESTING.md | 4 +- 3 files changed, 191 insertions(+), 10 deletions(-) diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index 4f722a61..1abc5dee 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -210,14 +210,6 @@ impl ProdContainerOrchestrator { } } - /// Test-only setter for the Phase 3.2 feature flag, so unit tests - /// can exercise the Quadlet-backend install path without going - /// through the full Config plumbing. - #[cfg(test)] - pub fn set_use_quadlet_backends(&mut self, on: bool) { - self.use_quadlet_backends = on; - } - /// Override the bitcoin-ui render paths (secret + output). Only used /// by tests that exercise the bitcoin-ui pre-start hook — the /// default `/var/lib/archipelago/...` paths are correct for prod. diff --git a/core/archipelago/src/container/quadlet.rs b/core/archipelago/src/container/quadlet.rs index fd1af134..5c3a06b8 100644 --- a/core/archipelago/src/container/quadlet.rs +++ b/core/archipelago/src/container/quadlet.rs @@ -86,6 +86,24 @@ impl RestartPolicy { } } +/// Container healthcheck wired through to systemd via `Notify=healthy`. +/// When set, `systemctl start .service` blocks until the container's +/// own healthcheck reports green — eliminating the "container up but RPC +/// not ready" race that the orchestrator currently papers over with +/// post-start polling. +/// +/// Ranges roughly mirror the manifest's HealthCheck struct: `cmd` is the +/// shell form (`/usr/bin/curl -fsS http://localhost:8332/health` etc.), +/// `interval`/`timeout` use systemd time format ("30s", "5m"), `retries` +/// is the consecutive-failures threshold before "unhealthy" trips. +#[derive(Debug, Clone)] +pub struct HealthSpec { + pub cmd: String, + pub interval: String, + pub timeout: String, + pub retries: u32, +} + /// One Quadlet `.container` unit. Field set is deliberately small — /// add a new field only when a real manifest needs it. #[derive(Debug, Clone, Default)] @@ -101,6 +119,10 @@ pub struct QuadletUnit { pub bind_mounts: Vec, pub extra_podman_args: Vec, pub depends_on: Vec, + /// Phase 3.4: when present the rendered unit emits HealthCmd=, + /// HealthInterval=, HealthTimeout=, HealthRetries=, AND Notify=healthy + /// so systemctl start blocks on a green health probe. + pub health: Option, // Backend-manifest extensions (Phase 3.1). Companion units leave // these defaulted; the renderer skips empty/false directives so a // companion's rendered bytes are unchanged from before this PR. @@ -203,6 +225,17 @@ impl QuadletUnit { if let Some(cpus) = self.cpu_quota { let _ = writeln!(s, "PodmanArgs=--cpus={cpus}"); } + if let Some(h) = &self.health { + let _ = writeln!(s, "HealthCmd={}", h.cmd); + let _ = writeln!(s, "HealthInterval={}", h.interval); + let _ = writeln!(s, "HealthTimeout={}", h.timeout); + let _ = writeln!(s, "HealthRetries={}", h.retries); + // Notify=healthy: systemd treats the unit as "started" only + // after the first green health probe. Start ordering + // (Requires=/After=) downstream of this unit therefore + // doesn't fire until the app is actually serving requests. + let _ = writeln!(s, "Notify=healthy"); + } if let Some(ep) = &self.entrypoint { // Quadlet's Exec= replaces the image entrypoint+cmd. When // the manifest provides both entrypoint and command we @@ -306,6 +339,7 @@ impl QuadletUnit { bind_mounts, extra_podman_args: vec![], depends_on: vec![], + health: app.health_check.as_ref().and_then(translate_health_check), ports: app .ports .iter() @@ -324,6 +358,44 @@ impl QuadletUnit { } } +/// Translate the manifest's HealthCheck shape into a HealthSpec the +/// renderer understands. Returns None when the manifest's health spec +/// is malformed or unsupported — we'd rather skip Notify=healthy than +/// emit a broken HealthCmd that fails the unit start forever. +/// +/// Supported shapes: +/// - type: tcp, endpoint: "host:port" → `nc -z host port` +/// - type: http, endpoint: "host:port", path → `curl -fsS http://host:port` +/// - type: cmd, endpoint: "" → `` verbatim +fn translate_health_check( + hc: &archipelago_container::HealthCheck, +) -> Option { + let cmd = match hc.check_type.as_str() { + "tcp" => { + let endpoint = hc.endpoint.as_deref()?; + let (host, port) = endpoint.rsplit_once(':')?; + // nc is in busybox/coreutils on every base image we ship. + // The -z flag does a "scan" that exits 0 on connect, 1 otherwise. + format!("nc -z {host} {port}") + } + "http" => { + let endpoint = hc.endpoint.as_deref()?; + let path = hc.path.as_deref().unwrap_or("/"); + // -fsS: fail on non-2xx, silent except on error, show errors. + // -m 5: per-request timeout matches the default manifest timeout. + format!("curl -fsS -m 5 http://{endpoint}{path}") + } + "cmd" => hc.endpoint.as_deref()?.to_string(), + _ => return None, + }; + Some(HealthSpec { + cmd, + interval: hc.interval.clone(), + timeout: hc.timeout.clone(), + retries: hc.retries, + }) +} + /// Parse the manifest's memory_limit string into MiB. Recognises the /// forms our manifests actually use: "", "m"/"M", "g"/"G". /// Returns None for anything else; the caller treats None as unlimited. @@ -758,6 +830,123 @@ app: assert_eq!(u.bind_mounts[0].host, PathBuf::from("/var/lib/x")); } + #[test] + fn render_emits_health_directives_when_set() { + let mut u = QuadletUnit::default(); + u.name = "lnd".into(); + u.image = "x:1".into(); + u.health = Some(HealthSpec { + cmd: "nc -z localhost 10009".into(), + interval: "30s".into(), + timeout: "5s".into(), + retries: 3, + }); + let s = u.render(); + assert!(s.contains("HealthCmd=nc -z localhost 10009")); + assert!(s.contains("HealthInterval=30s")); + assert!(s.contains("HealthTimeout=5s")); + assert!(s.contains("HealthRetries=3")); + assert!(s.contains("Notify=healthy")); + } + + #[test] + fn render_skips_health_directives_when_absent() { + // No health spec → no Notify=healthy and no HealthCmd, so companion + // units (which never set health) keep their existing behavior: the + // unit is "started" the moment the process spawns. + let s = sample_unit().render(); + assert!(!s.contains("HealthCmd=")); + assert!(!s.contains("Notify=healthy")); + assert!(!s.contains("HealthRetries=")); + } + + #[test] + fn translate_health_check_handles_each_supported_type() { + use archipelago_container::HealthCheck; + let tcp = HealthCheck { + check_type: "tcp".into(), + endpoint: Some("localhost:10009".into()), + path: None, + interval: "30s".into(), + timeout: "5s".into(), + retries: 3, + }; + let h = translate_health_check(&tcp).expect("tcp must translate"); + assert_eq!(h.cmd, "nc -z localhost 10009"); + assert_eq!(h.retries, 3); + + let http = HealthCheck { + check_type: "http".into(), + endpoint: Some("localhost:8080".into()), + path: Some("/health".into()), + interval: "10s".into(), + timeout: "3s".into(), + retries: 5, + }; + let h = translate_health_check(&http).expect("http must translate"); + assert_eq!(h.cmd, "curl -fsS -m 5 http://localhost:8080/health"); + + let cmdck = HealthCheck { + check_type: "cmd".into(), + endpoint: Some("/usr/local/bin/probe.sh".into()), + path: None, + interval: "60s".into(), + timeout: "15s".into(), + retries: 2, + }; + let h = translate_health_check(&cmdck).expect("cmd must translate"); + assert_eq!(h.cmd, "/usr/local/bin/probe.sh"); + + // Unknown type → None (renderer skips Notify=healthy entirely + // rather than emit a broken HealthCmd that hangs the unit start). + let bad = HealthCheck { + check_type: "exec".into(), + endpoint: Some("foo".into()), + path: None, + interval: "30s".into(), + timeout: "5s".into(), + retries: 3, + }; + assert!(translate_health_check(&bad).is_none()); + + // Malformed tcp endpoint → None (no port separator). + let badtcp = HealthCheck { + check_type: "tcp".into(), + endpoint: Some("hostonly".into()), + path: None, + interval: "30s".into(), + timeout: "5s".into(), + retries: 3, + }; + assert!(translate_health_check(&badtcp).is_none()); + } + + #[test] + fn from_manifest_picks_up_health_check() { + let yaml = r#" +app: + id: lnd + name: LND + version: 1.0.0 + container: + image: x:1 + health_check: + type: tcp + endpoint: localhost:10009 + interval: 15s + timeout: 4s + retries: 5 +"#; + let m = AppManifest::parse(yaml).unwrap(); + let u = QuadletUnit::from_manifest(&m, "lnd"); + let h = u.health.as_ref().expect("health should be populated"); + assert_eq!(h.cmd, "nc -z localhost 10009"); + assert_eq!(h.interval, "15s"); + assert_eq!(h.timeout, "4s"); + assert_eq!(h.retries, 5); + assert!(u.render().contains("Notify=healthy")); + } + #[test] fn from_manifest_renders_to_a_systemd_unit() { // End-to-end: parse a real-shape manifest, build the unit, render diff --git a/tests/lifecycle/TESTING.md b/tests/lifecycle/TESTING.md index 4e8c3494..a527b7d5 100644 --- a/tests/lifecycle/TESTING.md +++ b/tests/lifecycle/TESTING.md @@ -54,7 +54,7 @@ v1.7.52 tags. | Layer | Tests | Suites | Status | |---|---:|---:|---| -| L0 unit | 624 | n/a | ● green | +| L0 unit | 628 | n/a | ● green | | L1 RPC | 70 | bitcoin-knots, lnd, electrumx, btcpay, mempool, fedimint, required-stack, package-update-smoke | ● for the 6 core apps | | L2 UI | 9 | ui-coverage | ● for dashboard + 7 proxy paths + bitcoin-ui:8334 | | L3 lifecycle survival | 8 | companion-survives-archipelago-restart, backend-survives-archipelago-restart, required-stack-destructive | ◐ companions ● ; backends ◐ regression-gate (will fail until Phase 3 Quadlet ships) | @@ -96,7 +96,7 @@ Goal: minimum-viable container subsystem. | `core/container/src/bitcoin_simulator.rs` | 219 | 0 | -219 | ○ couples with dev_orchestrator | | `core/container/src/port_manager.rs` | 175 | 0 | -175 | ○ couples with dev_orchestrator | | `core/archipelago/src/api/rpc/package/install.rs::install_bitcoincoin_rpc_repair` | ~150 | 0 | -150 | ◐ pending fold into orchestrator pre-start | -| imperative `install_fresh` in prod_orchestrator | ~120 | 0 | -120 | ◐ Phase 3.2 wired behind `use_quadlet_backends` flag (default off); 3.3 in-place migration ✅; flip default after 20× green | +| imperative `install_fresh` in prod_orchestrator | ~120 | 0 | -120 | ◐ Phase 3.2 wired behind `use_quadlet_backends` flag (default off); 3.3 in-place migration ✅; 3.4 health-gated startup (`Notify=healthy`) ✅; flip default after 20× green | **Today: -270 LoC committed. Outstanding deletes possible: ~1,616 LoC** (if Phase 3 ships fully + dev_mode resolved).