//! Bootstrap host-side artifacts on every archipelago startup. //! //! The update pipeline swaps the archipelago binary but does not touch //! scripts, systemd units, or nginx configuration — those are installed //! once by the ISO builder. Without this module, changes to //! `container-doctor.sh`, the doctor service/timer, or the nginx config //! never reach boxes installed before the change. //! //! Two things are synced on startup: //! 1. Doctor artifacts (container-doctor.sh + service + timer). //! 2. Missing nginx backend proxy blocks required for frontend fetches to //! reach the backend instead of the SPA fallback. //! //! Idempotent: no-ops on boxes that are already in sync. All work is //! best-effort — failures are logged but never abort the backend. use anyhow::{Context, Result}; use std::path::{Path, PathBuf}; use tokio::fs; use tracing::{debug, info, warn}; use crate::update::host_sudo; const DOCTOR_SH: &str = include_str!("../../../scripts/container-doctor.sh"); const DOCTOR_SERVICE: &str = include_str!("../../../image-recipe/configs/archipelago-doctor.service"); const DOCTOR_TIMER: &str = include_str!("../../../image-recipe/configs/archipelago-doctor.timer"); const DOCTOR_SH_PATH: &str = "/home/archipelago/archy/scripts/container-doctor.sh"; const DOCTOR_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-doctor.service"; const DOCTOR_TIMER_PATH: &str = "/etc/systemd/system/archipelago-doctor.timer"; const NGINX_CONF_PATH: &str = "/etc/nginx/sites-available/archipelago"; const NGINX_ENABLED_CONF_PATH: &str = "/etc/nginx/sites-enabled/archipelago"; const RUNTIME_ASSETS_DIR: &str = "/opt/archipelago/web-ui/archipelago-runtime"; /// Inserted into every server block of the nginx config that lacks the /// `/api/app-catalog` proxy. Kept in sync with the canonical block in /// image-recipe/configs/nginx-archipelago.conf. const NGINX_APP_CATALOG_BLOCK: &str = "\n # App Store catalog proxy — backend fetches from configured registries\n # so the browser doesn't hit CORS/CSP. Without this block nginx falls\n # through to the SPA index.html and the frontend gets HTML back instead\n # of JSON.\n location /api/app-catalog {\n proxy_pass http://127.0.0.1:5678;\n proxy_http_version 1.1;\n proxy_set_header Host $host;\n proxy_set_header X-Real-IP $remote_addr;\n proxy_set_header Cookie $http_cookie;\n proxy_connect_timeout 15s;\n proxy_read_timeout 30s;\n proxy_send_timeout 15s;\n error_page 502 503 = @backend_unavailable;\n error_page 504 = @backend_timeout;\n }\n\n"; const NGINX_BITCOIN_STATUS_BLOCK: &str = "\n location /bitcoin-status {\n proxy_pass http://127.0.0.1:5678/bitcoin-status;\n proxy_http_version 1.1;\n proxy_set_header Host $host;\n proxy_connect_timeout 10s;\n proxy_read_timeout 10s;\n proxy_send_timeout 5s;\n error_page 502 503 = @backend_unavailable;\n error_page 504 = @backend_timeout;\n }\n"; /// Entry point called from main startup. Never returns an error to the caller — /// failing to bootstrap host artifacts must not prevent the backend from serving. pub async fn ensure_doctor_installed() { match run_service_override_repair().await { Ok(true) => info!("Removed stale Archipelago dev-mode service override"), Ok(false) => debug!("No stale Archipelago dev-mode service override found"), Err(e) => warn!("Service override repair failed (non-fatal): {:#}", e), } match run_runtime_assets().await { Ok(changed) if changed => info!("Runtime assets synchronized from OTA payload"), Ok(_) => debug!("No OTA runtime payload to synchronize"), Err(e) => warn!("Runtime asset bootstrap failed (non-fatal): {:#}", e), } match run().await { Ok(changed) if changed => info!("Doctor artifacts synchronized with binary"), Ok(_) => debug!("Doctor artifacts already in sync"), Err(e) => warn!("Doctor bootstrap failed (non-fatal): {:#}", e), } match run_nginx().await { Ok(true) => info!("Patched nginx config to proxy missing backend endpoints"), Ok(false) => debug!("Nginx backend endpoint proxy blocks already present"), Err(e) => warn!("Nginx bootstrap failed (non-fatal): {:#}", e), } match run_bitcoin_rpc_repair().await { Ok(true) => { info!("Repaired Bitcoin RPC bind settings; running Bitcoin containers left untouched") } Ok(false) => debug!("Bitcoin RPC bind settings already usable"), Err(e) => warn!("Bitcoin RPC repair failed (non-fatal): {:#}", e), } match tighten_secrets_dir().await { Ok(n) if n > 0 => info!(tightened = n, "Tightened mode on secret files"), Ok(_) => debug!("Secrets directory already at expected mode"), Err(e) => warn!("Secrets dir tightening failed (non-fatal): {:#}", e), } // Podman probing MUST be the last bootstrap stage. We used to delete // transient runroot state here when `podman info` failed, but live nodes // can still have rootlessport/conmon processes holding that state. Removing // it automatically makes failures worse: containers lose `.containerenv`, // ports stay bound, and later starts fail. Report the fault instead; repair // must be deliberate/operator-driven. match heal_podman_state().await { Ok(PodmanHealOutcome::Healthy) => debug!("podman runtime state healthy"), Ok(PodmanHealOutcome::Unhealthy) => warn!( "podman runtime state is unhealthy at startup — skipping automatic runroot cleanup" ), Err(e) => warn!( "podman self-heal failed (non-fatal, will retry next boot): {:#}", e ), } } #[derive(Debug, PartialEq, Eq)] enum PodmanHealOutcome { Healthy, Unhealthy, } async fn heal_podman_state() -> Result { if probe_podman_ok().await { return Ok(PodmanHealOutcome::Healthy); } Ok(PodmanHealOutcome::Unhealthy) } /// True iff `podman info` returns 0 within 5s. Any timeout, spawn /// failure, or non-zero exit reads as "wedged" and triggers cleanup. async fn probe_podman_ok() -> bool { use std::time::Duration; let probe = tokio::time::timeout( Duration::from_secs(5), tokio::process::Command::new("podman") .arg("info") .arg("--format=json") .output(), ) .await; match probe { Ok(Ok(out)) => out.status.success(), Ok(Err(_)) | Err(_) => false, } } /// Make sure /var/lib/archipelago/secrets/ stays 0700 owned by archipelago, /// and every file inside is 0600. The parent dir mode is the load-bearing /// boundary against host-side reads from other UIDs (rootless container /// escapes get mapped to UID >= 100000 and can't traverse a 0700/uid=1000 /// directory). The per-file 0600 sweep is defense-in-depth in case some /// installer wrote a 0644 file. async fn tighten_secrets_dir() -> Result { let dir = Path::new("/var/lib/archipelago/secrets"); if !dir.exists() { return Ok(0); } use std::os::unix::fs::PermissionsExt; fs::set_permissions(dir, std::fs::Permissions::from_mode(0o700)) .await .with_context(|| format!("chmod 0700 {}", dir.display()))?; let mut entries = fs::read_dir(dir) .await .with_context(|| format!("read_dir {}", dir.display()))?; let mut tightened = 0u32; while let Some(entry) = entries.next_entry().await? { let path = entry.path(); let meta = match entry.metadata().await { Ok(m) => m, Err(_) => continue, }; if !meta.is_file() { continue; } if meta.permissions().mode() & 0o777 != 0o600 { fs::set_permissions(&path, std::fs::Permissions::from_mode(0o600)) .await .with_context(|| format!("chmod 0600 {}", path.display()))?; tightened += 1; } } Ok(tightened) } async fn run_service_override_repair() -> Result { let override_path = Path::new("/etc/systemd/system/archipelago.service.d/override.conf"); let Ok(content) = fs::read_to_string(override_path).await else { return Ok(false); }; if !content.contains("ARCHIPELAGO_DEV_MODE=true") { return Ok(false); } let only_dev_mode_override = content .lines() .map(str::trim) .filter(|line| !line.is_empty() && !line.starts_with('#')) .all(|line| line == "[Service]" || line == "Environment=ARCHIPELAGO_DEV_MODE=true"); if !only_dev_mode_override { warn!( path = %override_path.display(), "Archipelago service override contains ARCHIPELAGO_DEV_MODE=true plus other settings; leaving it untouched" ); return Ok(false); } let path_s = override_path.to_string_lossy().to_string(); let status = host_sudo(&["rm", "-f", &path_s]) .await .with_context(|| format!("remove {}", override_path.display()))?; if !status.success() { anyhow::bail!("remove {} exited with {}", override_path.display(), status); } let _ = host_sudo(&["systemctl", "daemon-reload"]).await; Ok(true) } async fn run_runtime_assets() -> Result { // The v1.7.50 OTA bridge puts scripts/apps/docker assets inside the // frontend tarball because older binaries only know how to apply the // backend binary and frontend archive. Once the new backend starts, it // promotes that payload into /opt so app installs use the matching specs. let runtime_dir = Path::new(RUNTIME_ASSETS_DIR); if !runtime_dir.exists() { return Ok(false); } let mut changed = false; for (relative, dest) in [ ("apps", "/opt/archipelago/apps"), ("scripts", "/opt/archipelago/scripts"), ("docker", "/opt/archipelago/docker"), ] { let src = runtime_dir.join(relative); if src.exists() { replace_dir_from_runtime(&src, dest).await?; if relative == "scripts" { let _ = host_sudo(&[ "find", dest, "-type", "f", "-name", "*.sh", "-exec", "chmod", "755", "{}", "+", ]) .await; let image_versions = format!("{}/image-versions.sh", dest); if Path::new(&image_versions).exists() { let _ = host_sudo(&["cp", &image_versions, "/opt/archipelago/image-versions.sh"]) .await; } } changed = true; } } let configs = runtime_dir.join("image-recipe/configs"); for unit in ["archipelago-doctor.service", "archipelago-doctor.timer"] { let src = configs.join(unit); if src.exists() { let src_s = src.to_string_lossy().to_string(); let dest = format!("/etc/systemd/system/{}", unit); let status = host_sudo(&["install", "-m", "644", &src_s, &dest]) .await .with_context(|| format!("install {}", unit))?; if !status.success() { anyhow::bail!("install {} exited with {}", unit, status); } changed = true; } } if changed { let _ = host_sudo(&["systemctl", "daemon-reload"]).await; } Ok(changed) } async fn replace_dir_from_runtime(src: &Path, dest: &str) -> Result<()> { let tmp = format!("{}.new.{}", dest, chrono::Utc::now().timestamp_millis()); let src_dot = path_dot(src); let mkdir = host_sudo(&["mkdir", "-p", &tmp]) .await .with_context(|| format!("mkdir {}", tmp))?; if !mkdir.success() { anyhow::bail!("mkdir {} exited with {}", tmp, mkdir); } let copy = host_sudo(&["cp", "-a", &src_dot, &tmp]) .await .with_context(|| format!("copy runtime {} -> {}", src.display(), tmp))?; if !copy.success() { let _ = host_sudo(&["rm", "-rf", &tmp]).await; anyhow::bail!("copy runtime {} exited with {}", src.display(), copy); } let _ = host_sudo(&["mkdir", "-p", dest]).await; let cleanup = host_sudo(&[ "find", dest, "-mindepth", "1", "-maxdepth", "1", "-exec", "rm", "-rf", "{}", "+", ]) .await .with_context(|| format!("clean {}", dest))?; if !cleanup.success() { let _ = host_sudo(&["rm", "-rf", &tmp]).await; anyhow::bail!("clean {} exited with {}", dest, cleanup); } let tmp_dot = format!("{}/.", tmp); let promote = host_sudo(&["cp", "-a", &tmp_dot, dest]) .await .with_context(|| format!("promote {} -> {}", tmp, dest))?; let _ = host_sudo(&["rm", "-rf", &tmp]).await; if !promote.success() { anyhow::bail!("promote {} exited with {}", dest, promote); } Ok(()) } fn path_dot(path: &Path) -> String { let mut p = PathBuf::from(path); p.push("."); p.to_string_lossy().to_string() } async fn run_bitcoin_rpc_repair() -> Result { // Older installs can have a container-owned bitcoin.conf with only rpcauth // and printtoconsole. Repair it at startup so OTA fixes existing nodes // without a manual uninstall/reinstall. Bind/port stay in the container // command line to avoid duplicate RPC endpoint definitions. let script = r#" set -eu conf=/var/lib/archipelago/bitcoin/bitcoin.conf [ -f "$conf" ] || exit 0 changed=0 ensure_line() { line="$1" key="${line%%=*}" if ! grep -q "^${key}=" "$conf"; then printf '%s\n' "$line" >> "$conf" changed=1 fi } ensure_line server=1 ensure_line rpcallowip=0.0.0.0/0 ensure_line listen=1 [ "$changed" -eq 0 ] && exit 0 exit 2 "#; let status = host_sudo(&["sh", "-lc", script]) .await .context("repair bitcoin.conf RPC bind settings")?; match status.code() { Some(0) => Ok(false), // Do not restart Bitcoin from bootstrap. During IBD, an automatic // restart can cost hours of progress. The repaired file is only a // fallback for future starts; current containers keep their command-line // RPC args until an operator or update intentionally restarts them. Some(2) => Ok(true), _ => { warn!("Bitcoin RPC repair helper exited with {}", status); Ok(false) } } } async fn run() -> Result { // Dev-box guard: on contributors' laptops `/home/archipelago/archy` is // typically a symlink into the git checkout, and writing through it // would clobber the working tree with whatever the binary happens to // have been compiled from. Production ISO installs materialize a real // directory. let home_archy = Path::new("/home/archipelago/archy"); if fs::symlink_metadata(home_archy) .await .map(|m| m.file_type().is_symlink()) .unwrap_or(false) { debug!("/home/archipelago/archy is a symlink — skipping doctor bootstrap (dev box)"); return Ok(false); } // Skip entirely on machines without the canonical scripts directory — // writing orphan files there just causes confusion. let scripts_dir = Path::new(DOCTOR_SH_PATH) .parent() .context("doctor script path has no parent")?; if !scripts_dir.exists() { debug!( "Scripts dir {} missing — skipping doctor bootstrap", scripts_dir.display() ); return Ok(false); } let mut changed = false; // 1. Script — lives in archipelago's home dir, user-writable. if needs_write(DOCTOR_SH_PATH, DOCTOR_SH).await { fs::write(DOCTOR_SH_PATH, DOCTOR_SH) .await .with_context(|| format!("write {}", DOCTOR_SH_PATH))?; let _ = tokio::process::Command::new("chmod") .args(["+x", DOCTOR_SH_PATH]) .status() .await; info!("Updated {}", DOCTOR_SH_PATH); changed = true; } // 2. Systemd unit files — /etc is restricted; route through host_sudo. let service_changed = write_root_if_needed(DOCTOR_SERVICE_PATH, DOCTOR_SERVICE).await?; let timer_changed = write_root_if_needed(DOCTOR_TIMER_PATH, DOCTOR_TIMER).await?; changed = changed || service_changed || timer_changed; // 3. Reload if units changed. Do not enable/start the timer here: lifecycle // qualification and explicit app operations need deterministic Podman // ownership, and the doctor can race those flows. Operators can enable it // separately when they want periodic host repair. if service_changed || timer_changed { if let Err(e) = host_sudo(&["systemctl", "daemon-reload"]).await { warn!("daemon-reload failed: {:#}", e); } } Ok(changed) } async fn needs_write(path: &str, expected: &str) -> bool { match fs::read_to_string(path).await { Ok(current) => current != expected, Err(_) => true, } } /// Write content to a root-owned path via `sudo mv` of a user-owned tmp file. /// Returns true if a write happened. async fn write_root_if_needed(path: &str, content: &str) -> Result { if !needs_write(path, content).await { return Ok(false); } let tmp = format!( "/tmp/archipelago-bootstrap-{}-{}.tmp", std::process::id(), Path::new(path) .file_name() .and_then(|n| n.to_str()) .unwrap_or("unit") ); fs::write(&tmp, content) .await .with_context(|| format!("write tmp {}", tmp))?; let status = host_sudo(&["mv", &tmp, path]) .await .with_context(|| format!("sudo mv {} -> {}", tmp, path))?; if !status.success() { let _ = fs::remove_file(&tmp).await; anyhow::bail!("sudo mv to {} exited with {}", path, status); } info!("Updated {}", path); Ok(true) } /// Patch the nginx site config to add missing backend proxy blocks. Older ISO /// configs shipped individual per-endpoint `location` blocks, so missing /// endpoints silently fell through to the SPA `index.html` and the frontend /// got HTML instead of JSON. /// /// Validates via `nginx -t` before reloading. On failure the patch is /// rolled back from a backup written just before the write. async fn run_nginx() -> Result { // Skip on dev symlinks — we don't want to touch `/etc/nginx` on laptops. let home_archy = Path::new("/home/archipelago/archy"); if fs::symlink_metadata(home_archy) .await .map(|m| m.file_type().is_symlink()) .unwrap_or(false) { return Ok(false); } let mut changed = false; let mut patched_paths = Vec::::new(); for path in [NGINX_CONF_PATH, NGINX_ENABLED_CONF_PATH] { let candidate = Path::new(path); if !candidate.exists() { debug!("{} missing — skipping nginx bootstrap", path); continue; } let canonical = fs::canonicalize(candidate) .await .unwrap_or_else(|_| candidate.to_path_buf()); if patched_paths.iter().any(|p| p == &canonical) { continue; } patched_paths.push(canonical); changed |= patch_nginx_conf(path).await?; } Ok(changed) } async fn patch_nginx_conf(path: &str) -> Result { let content = fs::read_to_string(path) .await .with_context(|| format!("read {}", path))?; let missing_app_catalog = !content.contains("location /api/app-catalog"); let missing_bitcoin_status = !content.contains("location /bitcoin-status"); if !missing_app_catalog && !missing_bitcoin_status { return Ok(false); } let mut patched = content.clone(); if missing_bitcoin_status { let anchor = " location /electrs-status {"; if !patched.contains(anchor) { warn!("nginx conf missing electrs-status anchor — skipping /bitcoin-status patch"); } else { let replacement = format!("{}{}", NGINX_BITCOIN_STATUS_BLOCK, anchor); patched = patched.replace(anchor, &replacement); } } if missing_app_catalog { // The DWN comment sits at the same indent right after the `/api/blob` // block in both server blocks — a stable anchor that existed on every // ISO shipped to date. If it's absent (config got heavily customized), // skip rather than guess where to splice. let anchor = " # DWN endpoints — peer access over Tor (no auth)"; if !patched.contains(anchor) { warn!("nginx conf missing DWN anchor — skipping /api/app-catalog patch"); } else { let replacement = format!("{}{}", NGINX_APP_CATALOG_BLOCK, anchor); patched = patched.replace(anchor, &replacement); } } if patched == content { return Ok(false); } // Write patched config via a user-owned tmp + sudo mv, after stashing // a backup outside nginx include dirs so validation cannot load it too. let pid = std::process::id(); let tmp = format!("/tmp/archipelago-nginx-{}.conf", pid); fs::write(&tmp, &patched) .await .with_context(|| format!("write {}", tmp))?; let backup = format!( "/tmp/archipelago-nginx-backup-{}-{}.conf", pid, patched.len() ); if let Err(e) = host_sudo(&["cp", path, &backup]).await { let _ = fs::remove_file(&tmp).await; return Err(e.context("backup nginx conf")); } let mv = host_sudo(&["mv", &tmp, path]).await; match mv { Ok(s) if s.success() => {} Ok(s) => { let _ = fs::remove_file(&tmp).await; anyhow::bail!("sudo mv nginx conf to {} exited with {}", path, s); } Err(e) => { let _ = fs::remove_file(&tmp).await; return Err(e.context("mv tmp -> nginx conf")); } } // Validate. let test = host_sudo(&["nginx", "-t"]).await; let valid = matches!(&test, Ok(s) if s.success()); if !valid { warn!("nginx -t failed after patch — reverting"); let _ = host_sudo(&["mv", &backup, path]).await; if let Err(e) = test { return Err(e.context("nginx -t")); } anyhow::bail!("nginx config invalid after patch — reverted"); } // Reload nginx so the new block takes effect immediately. Reload (not // restart) keeps in-flight connections alive. if let Err(e) = host_sudo(&["systemctl", "reload", "nginx"]).await { warn!("nginx reload failed (non-fatal): {:#}", e); } let _ = host_sudo(&["rm", "-f", &backup]).await; Ok(true) } #[cfg(test)] mod tests { use super::*; #[test] fn podman_heal_outcome_no_longer_has_cleanup_variant() { let outcome = PodmanHealOutcome::Unhealthy; assert_ne!(outcome, PodmanHealOutcome::Healthy); } }