diff --git a/core/archipelago/src/api/rpc/package/dependencies.rs b/core/archipelago/src/api/rpc/package/dependencies.rs index 05a89fa7..ccb8e752 100644 --- a/core/archipelago/src/api/rpc/package/dependencies.rs +++ b/core/archipelago/src/api/rpc/package/dependencies.rs @@ -472,7 +472,7 @@ pub(super) async fn check_bitcoin_pruning_compatibility(package_id: &str) -> Res tokio::time::sleep(std::time::Duration::from_secs(2)).await; } - if detect_disk_gb() < ARCHIVAL_BITCOIN_DISK_GB { + if detect_disk_gb().await < ARCHIVAL_BITCOIN_DISK_GB { anyhow::bail!(archival_bitcoin_required_message(package_id)); } @@ -497,10 +497,11 @@ fn check_blockchain_info_for_pruning(package_id: &str, json: &serde_json::Value) Ok(()) } -fn detect_disk_gb() -> u64 { - let output = std::process::Command::new("df") +async fn detect_disk_gb() -> u64 { + let output = tokio::process::Command::new("df") .args(["-BG", "/var/lib/archipelago"]) - .output(); + .output() + .await; let Ok(output) = output else { return u64::MAX; }; diff --git a/core/archipelago/src/api/rpc/package/install.rs b/core/archipelago/src/api/rpc/package/install.rs index e01598ab..8dc43714 100644 --- a/core/archipelago/src/api/rpc/package/install.rs +++ b/core/archipelago/src/api/rpc/package/install.rs @@ -2196,13 +2196,14 @@ async fn ensure_host_port_listener( container_name: &str, runtime_ports: &[String], ) -> Result<()> { - let Some(port) = runtime_ports + let mut port = runtime_ports .first() .and_then(|p| p.split(':').next()) - .and_then(|p| p.parse::().ok()) - .or_else(|| published_host_port(container_name)) - .or_else(|| required_host_port(package_id)) - else { + .and_then(|p| p.parse::().ok()); + if port.is_none() { + port = published_host_port(container_name).await; + } + let Some(port) = port.or_else(|| required_host_port(package_id)) else { return Ok(()); }; @@ -2248,10 +2249,11 @@ async fn ensure_host_port_listener( )) } -fn published_host_port(container_name: &str) -> Option { - let output = std::process::Command::new("podman") +async fn published_host_port(container_name: &str) -> Option { + let output = tokio::process::Command::new("podman") .args(["port", container_name]) .output() + .await .ok()?; if !output.status.success() { return None; diff --git a/core/archipelago/src/api/rpc/system/handlers.rs b/core/archipelago/src/api/rpc/system/handlers.rs index 512f2f3a..5ddfa4a4 100644 --- a/core/archipelago/src/api/rpc/system/handlers.rs +++ b/core/archipelago/src/api/rpc/system/handlers.rs @@ -575,7 +575,7 @@ impl RpcHandler { // Restart the service via systemd tokio::spawn(async { tokio::time::sleep(std::time::Duration::from_secs(2)).await; - let _ = std::process::Command::new("sudo") + let _ = tokio::process::Command::new("sudo") .args(["systemctl", "restart", "archipelago"]) .spawn(); }); diff --git a/core/archipelago/src/config.rs b/core/archipelago/src/config.rs index dea0a949..ad1a1b2d 100644 --- a/core/archipelago/src/config.rs +++ b/core/archipelago/src/config.rs @@ -81,10 +81,11 @@ pub struct Config { impl Config { /// Detect primary host IP (first non-loopback IPv4) - fn detect_host_ip() -> Result { - let output = std::process::Command::new("hostname") + async fn detect_host_ip() -> Result { + let output = tokio::process::Command::new("hostname") .args(["-I"]) .output() + .await .context("Failed to run hostname -I")?; let s = String::from_utf8_lossy(&output.stdout); let ip = s @@ -210,7 +211,9 @@ impl Config { if let Ok(ip) = std::env::var("ARCHIPELAGO_HOST_IP") { config.host_ip = ip; } else { - config.host_ip = Self::detect_host_ip().unwrap_or_else(|_| "127.0.0.1".to_string()); + config.host_ip = Self::detect_host_ip() + .await + .unwrap_or_else(|_| "127.0.0.1".to_string()); } // Ensure data directory exists diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs index 5825afa6..8fdbe869 100644 --- a/core/archipelago/src/container/prod_orchestrator.rs +++ b/core/archipelago/src/container/prod_orchestrator.rs @@ -1440,7 +1440,7 @@ impl ProdContainerOrchestrator { .collect() }; let mut report = ReconcileReport::default(); - let disk_gb = self.disk_gb(); + let disk_gb = self.disk_gb().await; // Register every candidate before the (sequential, possibly slow) // pass so the scanner overlays queued-but-down apps as Restarting // instead of Stopped. Each app is deregistered as its turn finishes, @@ -1620,7 +1620,7 @@ impl ProdContainerOrchestrator { } let mut resolved_manifest = lm.manifest.clone(); - self.resolve_dynamic_env(&mut resolved_manifest)?; + self.resolve_dynamic_env(&mut resolved_manifest).await?; let name = compute_container_name(&lm.manifest); // An explicitly user-stopped app MUST stay stopped. The reconcile filter @@ -1970,7 +1970,7 @@ impl ProdContainerOrchestrator { async fn install_fresh(&self, lm: &LoadedManifest) -> Result<()> { self.ensure_app_secrets(&lm.manifest.app.id).await?; let mut resolved_manifest = lm.manifest.clone(); - self.resolve_dynamic_env(&mut resolved_manifest)?; + self.resolve_dynamic_env(&mut resolved_manifest).await?; resolve_catalog_image(&mut resolved_manifest); let resolved = resolved_manifest.app.container.resolve().ok_or_else(|| { @@ -2263,7 +2263,7 @@ impl ProdContainerOrchestrator { // Re-render the manifest with dynamic env baked in, then go // through the same install path a fresh install would. let mut resolved = lm.manifest.clone(); - self.resolve_dynamic_env(&mut resolved)?; + self.resolve_dynamic_env(&mut resolved).await?; self.install_via_quadlet(&resolved, name) .await .with_context(|| format!("Phase 3.3: re-install {name} via Quadlet"))?; @@ -2329,7 +2329,7 @@ impl ProdContainerOrchestrator { let restart_required = quadlet::contains_stale_health_gate(&old_body); let mut resolved = lm.manifest.clone(); - self.resolve_dynamic_env(&mut resolved)?; + self.resolve_dynamic_env(&mut resolved).await?; // Same catalog/pinned-version image resolution the installer applies, so // the drift re-render doesn't revert a pinned version back to the // manifest's shipped `:latest` tag on the next reconcile tick. @@ -2404,7 +2404,7 @@ impl ProdContainerOrchestrator { for dep in dependencies { let mut resolved = dep.manifest.clone(); - self.resolve_dynamic_env(&mut resolved)?; + self.resolve_dynamic_env(&mut resolved).await?; let name = compute_container_name(&dep.manifest); if self.runtime.get_container_status(&name).await.is_err() { continue; @@ -2588,7 +2588,7 @@ impl ProdContainerOrchestrator { } .read("bitcoin-rpc-password") .context("lnd pre-start: read bitcoin RPC password")?; - let bitcoin_host = self.bitcoin_host(); + let bitcoin_host = self.bitcoin_host().await; let outcome = lnd::ensure_config(&self.lnd_paths, &rpc_pass, &bitcoin_host) .await .context("lnd pre-start: ensure lnd.conf")?; @@ -2706,10 +2706,12 @@ impl ProdContainerOrchestrator { tokio::time::sleep(std::time::Duration::from_secs(1)).await; } - fn detect_host_facts(&self) -> HostFacts { - let host_ip = Self::detect_host_ip().unwrap_or_else(|| "127.0.0.1".to_string()); - let host_mdns = Self::detect_host_mdns(); - let disk_gb = self.disk_gb(); + async fn detect_host_facts(&self) -> HostFacts { + let host_ip = Self::detect_host_ip() + .await + .unwrap_or_else(|| "127.0.0.1".to_string()); + let host_mdns = Self::detect_host_mdns().await; + let disk_gb = self.disk_gb().await; HostFacts { host_ip, host_mdns, @@ -2723,9 +2725,8 @@ impl ProdContainerOrchestrator { /// Container name of the running Bitcoin node (`bitcoin-knots` or /// `bitcoin-core`) for the `{{BITCOIN_HOST}}` derived-env placeholder. - /// Synchronous `podman ps` to match the surrounding host-fact detection; - /// defaults to `bitcoin-knots` when none is running (B12). - fn bitcoin_host(&self) -> String { + /// Defaults to `bitcoin-knots` when none is running (B12). + async fn bitcoin_host(&self) -> String { #[cfg(test)] if let Some(host) = &self.test_bitcoin_host { return host.clone(); @@ -2733,9 +2734,10 @@ impl ProdContainerOrchestrator { // Mirrors api::rpc::package::dependencies (the legacy install path); // both Bitcoin node variants are reachable on archy-net by name. const BITCOIN_NAMES: &[&str] = &["bitcoin-knots", "bitcoin-core", "bitcoin"]; - let names = Command::new("podman") + let names = tokio::process::Command::new("podman") .args(["ps", "--format", "{{.Names}}"]) .output() + .await .ok() .filter(|o| o.status.success()) .map(|o| String::from_utf8_lossy(&o.stdout).into_owned()) @@ -2753,8 +2755,12 @@ impl ProdContainerOrchestrator { self.test_bitcoin_host = Some(host.to_string()); } - fn detect_host_ip() -> Option { - let output = Command::new("hostname").arg("-I").output().ok()?; + async fn detect_host_ip() -> Option { + let output = tokio::process::Command::new("hostname") + .arg("-I") + .output() + .await + .ok()?; if !output.status.success() { return None; } @@ -2762,9 +2768,10 @@ impl ProdContainerOrchestrator { stdout.split_whitespace().next().map(|s| s.to_string()) } - fn detect_host_mdns() -> String { - let hostname = Command::new("hostname") + async fn detect_host_mdns() -> String { + let hostname = tokio::process::Command::new("hostname") .output() + .await .ok() .and_then(|o| { if o.status.success() { @@ -2782,13 +2789,18 @@ impl ProdContainerOrchestrator { } } - fn detect_disk_gb() -> u64 { + async fn detect_disk_gb() -> u64 { let target = if Path::new("/var/lib/archipelago").exists() { "/var/lib/archipelago" } else { "/" }; - let output = match Command::new("df").arg("-k").arg(target).output() { + let output = match tokio::process::Command::new("df") + .arg("-k") + .arg(target) + .output() + .await + { Ok(o) if o.status.success() => o, _ => return 0, }; @@ -2808,12 +2820,12 @@ impl ProdContainerOrchestrator { kb / 1_000_000 } - fn disk_gb(&self) -> u64 { + async fn disk_gb(&self) -> u64 { #[cfg(test)] if let Some(disk_gb) = self.test_disk_gb { return disk_gb; } - Self::detect_disk_gb() + Self::detect_disk_gb().await } /// Ensure app-specific secrets exist *before* env resolution. The Bitcoin @@ -2838,14 +2850,14 @@ impl ProdContainerOrchestrator { Ok(()) } - fn resolve_dynamic_env(&self, manifest: &mut AppManifest) -> Result<()> { + async fn resolve_dynamic_env(&self, manifest: &mut AppManifest) -> Result<()> { // Materialise any manifest-declared generated secrets before they're // read below. This is the single chokepoint every install/reconcile // path funnels through, so an app's secrets exist by the time its // `secret_env` resolves — no per-app code, no host provisioning. crate::container::secrets::ensure_generated_secrets(&self.secrets_dir, manifest)?; - let mut facts = self.detect_host_facts(); + let mut facts = self.detect_host_facts().await; // Only pay the podman cost to detect Knots-vs-Core when this manifest // actually templates the Bitcoin node into its env (mempool — B12). if manifest @@ -2855,7 +2867,7 @@ impl ProdContainerOrchestrator { .iter() .any(|e| e.template.contains("{{BITCOIN_HOST}}")) { - facts.bitcoin_host = self.bitcoin_host(); + facts.bitcoin_host = self.bitcoin_host().await; } let mut env = manifest.app.environment.clone(); env.extend(manifest.app.container.resolve_derived_env(&facts)); @@ -3208,7 +3220,7 @@ impl ProdContainerOrchestrator { ) -> Result { let mut out = content.to_string(); if out.contains("{{HOST_IP}}") || out.contains("{{HOST_MDNS}}") { - let facts = self.detect_host_facts(); + let facts = self.detect_host_facts().await; out = out .replace("{{HOST_IP}}", &facts.host_ip) .replace("{{HOST_MDNS}}", &facts.host_mdns); @@ -3297,7 +3309,7 @@ impl ProdContainerOrchestrator { /// however the box is reached locally. (Generalised from the old per-app /// netbird TLS helper, deleted in #20 ph4: rsa:2048, 10-year, no per-app Rust.) async fn ensure_manifest_certs(&self, manifest: &AppManifest) -> Result<()> { - let facts = self.detect_host_facts(); + let facts = self.detect_host_facts().await; let render = |s: &str| { s.replace("{{HOST_IP}}", &facts.host_ip) .replace("{{HOST_MDNS}}", &facts.host_mdns) @@ -3594,7 +3606,7 @@ impl ContainerOrchestrator for ProdContainerOrchestrator { self.ensure_app_secrets(app_id).await?; let name = compute_container_name(&lm.manifest); let mut resolved_manifest = lm.manifest.clone(); - self.resolve_dynamic_env(&mut resolved_manifest)?; + self.resolve_dynamic_env(&mut resolved_manifest).await?; let service = format!("{name}.service"); if self.quadlet_unit_exists(&name).await? { @@ -4328,7 +4340,7 @@ app: orch.set_bitcoin_host_for_test(node); let mut manifest = AppManifest::parse(yaml).unwrap(); - orch.resolve_dynamic_env(&mut manifest).unwrap(); + orch.resolve_dynamic_env(&mut manifest).await.unwrap(); assert!( manifest diff --git a/core/archipelago/src/transport/fips.rs b/core/archipelago/src/transport/fips.rs index 15a28620..a922969f 100644 --- a/core/archipelago/src/transport/fips.rs +++ b/core/archipelago/src/transport/fips.rs @@ -23,26 +23,36 @@ use std::time::{Duration, SystemTime, UNIX_EPOCH}; /// TTL keeps the result responsive to daemon flaps without pounding DBus. const AVAILABILITY_CACHE_TTL: Duration = Duration::from_secs(10); +/// Availability cache shared with the background probe thread, so the +/// sync `is_available()` hot path never blocks on `systemctl`. +struct AvailabilityCache { + available: AtomicBool, + probed_at_ms: AtomicU64, + probe_in_flight: AtomicBool, +} + pub struct FipsTransport { identity_dir: PathBuf, - available_cached: AtomicBool, - available_cached_at_ms: AtomicU64, + availability: std::sync::Arc, } impl FipsTransport { pub fn new(identity_dir: &Path) -> Self { Self { identity_dir: identity_dir.to_path_buf(), - available_cached: AtomicBool::new(false), - available_cached_at_ms: AtomicU64::new(0), + availability: std::sync::Arc::new(AvailabilityCache { + available: AtomicBool::new(false), + probed_at_ms: AtomicU64::new(0), + probe_in_flight: AtomicBool::new(false), + }), } } fn probe_daemon_active() -> bool { - // Cheap blocking probe: spawn `systemctl is-active` synchronously. - // Short-circuit if either the archipelago-managed unit or the - // upstream fips.service is active — legacy/dev nodes run only the - // upstream unit. + // Blocking probe — only ever run on a dedicated background thread + // (see is_available), never on a tokio worker. Short-circuit if + // either the archipelago-managed unit or the upstream fips.service + // is active — legacy/dev nodes run only the upstream unit. for unit in [ crate::fips::SERVICE_UNIT, crate::fips::UPSTREAM_SERVICE_UNIT, @@ -70,14 +80,30 @@ impl NodeTransport for FipsTransport { .duration_since(UNIX_EPOCH) .map(|d| d.as_millis() as u64) .unwrap_or(0); - let cached_at = self.available_cached_at_ms.load(Ordering::Relaxed); + let cached_at = self.availability.probed_at_ms.load(Ordering::Relaxed); + let cached = self.availability.available.load(Ordering::Relaxed); if now_ms.saturating_sub(cached_at) < AVAILABILITY_CACHE_TTL.as_millis() as u64 { - return self.available_cached.load(Ordering::Relaxed); + return cached; } - let val = Self::probe_daemon_active(); - self.available_cached.store(val, Ordering::Relaxed); - self.available_cached_at_ms.store(now_ms, Ordering::Relaxed); - val + // Cache is stale. This sync trait method is called from async + // route(), so running the ~50ms systemctl probe inline stalls a + // tokio worker. Serve the stale value and refresh on a background + // thread instead — the transport supervisor's warm loop keeps this + // fresh in steady state, so staleness is bounded to one probe round. + let cache = std::sync::Arc::clone(&self.availability); + if !cache.probe_in_flight.swap(true, Ordering::Relaxed) { + std::thread::spawn(move || { + let val = Self::probe_daemon_active(); + let probed_ms = SystemTime::now() + .duration_since(UNIX_EPOCH) + .map(|d| d.as_millis() as u64) + .unwrap_or(0); + cache.available.store(val, Ordering::Relaxed); + cache.probed_at_ms.store(probed_ms, Ordering::Relaxed); + cache.probe_in_flight.store(false, Ordering::Relaxed); + }); + } + cached } fn send<'a>( diff --git a/core/container/src/runtime.rs b/core/container/src/runtime.rs index cf0dd0f6..4a7cf6f2 100644 --- a/core/container/src/runtime.rs +++ b/core/container/src/runtime.rs @@ -853,11 +853,11 @@ pub struct AutoRuntime { impl AutoRuntime { pub async fn new(user: String) -> Result { // Try Podman first - if Self::check_podman_available() { + if Self::check_podman_available().await { Ok(Self { runtime: Box::new(PodmanRuntime::new(user)), }) - } else if Self::check_docker_available() { + } else if Self::check_docker_available().await { Ok(Self { runtime: Box::new(DockerRuntime::new(user)), }) @@ -866,12 +866,20 @@ impl AutoRuntime { } } - fn check_podman_available() -> bool { - Command::new("podman").arg("--version").output().is_ok() + async fn check_podman_available() -> bool { + TokioCommand::new("podman") + .arg("--version") + .output() + .await + .is_ok() } - fn check_docker_available() -> bool { - Command::new("docker").arg("--version").output().is_ok() + async fn check_docker_available() -> bool { + TokioCommand::new("docker") + .arg("--version") + .output() + .await + .is_ok() } } diff --git a/docs/1.8.0-RELEASE-HARDENING-PLAN.md b/docs/1.8.0-RELEASE-HARDENING-PLAN.md index 68bdc07f..e3715ad1 100644 --- a/docs/1.8.0-RELEASE-HARDENING-PLAN.md +++ b/docs/1.8.0-RELEASE-HARDENING-PLAN.md @@ -114,11 +114,16 @@ modules; production request/boot paths are essentially panic-free. The real risk sweep (`scheduler.rs`), block-header cache (`mesh/mod.rs`), 7× peer-transport badge (`sync.rs` + `content.rs`). Federation tombstone/untombstone upgraded to hard errors (see §I). Install-log line write left fire-and-forget with an explanatory comment. -- [ ] 🟠 **Remove blocking `std::process::Command` from async handlers.** - `install.rs:2222` `published_host_port` (sync podman on the install path), - `dependencies.rs:316` (`df`), `system/handlers.rs:578` (`sudo`), `transport/fips.rs:50` - (`systemctl`) stall tokio workers under load. Convert to `tokio::process` or - `spawn_blocking`. Only 8 files use `std::process::Command` — bounded. +- [x] 🟠 **Remove blocking `std::process::Command` from async handlers.** DONE 2026-07-03: + converted to `tokio::process` — `published_host_port` (install), `detect_disk_gb` + (dependencies), factory-reset restart (system/handlers), `config.rs detect_host_ip`, + the orchestrator host-facts helpers (`detect_host_ip/mdns/disk_gb`, `bitcoin_host`, + `resolve_dynamic_env` now async through all 6 call sites), and `AutoRuntime::new` + probes. `transport/fips.rs is_available()` (sync trait method on the async route path) + now serves the cached value and refreshes via a background thread (stale-while- + revalidate) instead of blocking on systemctl. `image_verifier.rs` cosign sites have no + callers yet — handled with the §A cosign item. Tests: container 155 / transport 29 / + config 29 / package 46 all green. - [ ] 🟡 **Restrict Bitcoin RPC exposure.** `bootstrap.rs:409` writes `rpcallowip=0.0.0.0/0`. Scope to the container subnet / `127.0.0.1`. - [ ] 🟡 **Move generated secrets from env to file mounts.** `manifest.rs:1208-1226` @@ -159,9 +164,11 @@ The real issues are the app-bridge origin model and a bloated bundle. (precached by the service worker → blocks PWA install), plus ~18 MB of ~1 MB full-screen JPEGs. Convert backgrounds to WebP/AVIF at responsive sizes, lazy/stream the intro video, and exclude video/audio from the Workbox precache. Biggest, easiest perf win. -- [ ] 🟢 **DOMPurify the `Server.vue` QR SVG** (`:283/:295` render `v-html` unsanitized while - `TwoFactorSection.vue` sanitizes the analogous SVG); guard the unguarded `pollInterval` - (`Mesh.vue:391`); surface silent data-fetch failures (`curatedApps.ts:58/71`). +- [x] 🟢 **DOMPurify the `Server.vue` QR SVG / guard `Mesh.vue` pollInterval / surface + `curatedApps.ts` fetch failures.** DONE 2026-07-03: WireGuard peer QR now sanitized with + the same `USE_PROFILES: {svg}` call as TwoFactorSection; Mesh poll interval guarded + + nulled on unmount; catalog fetch failures log per-URL console.warn incl. the + all-sources-failed fallback. Bundle-verified. ---