//! Bootstrap host-side artifacts on every archipelago startup. //! //! The update pipeline swaps the archipelago binary but does not touch //! scripts, systemd units, or nginx configuration — those are installed //! once by the ISO builder. Without this module, changes to //! `container-doctor.sh`, the doctor service/timer, or the nginx config //! never reach boxes installed before the change. //! //! Two things are synced on startup: //! 1. Doctor artifacts (container-doctor.sh + service + timer). //! 2. An nginx `location /api/app-catalog` proxy block — required for //! the App Store catalog proxy to actually reach the backend. //! //! Idempotent: no-ops on boxes that are already in sync. All work is //! best-effort — failures are logged but never abort the backend. use anyhow::{Context, Result}; use std::path::{Path, PathBuf}; use tokio::fs; use tracing::{debug, info, warn}; use crate::update::host_sudo; const DOCTOR_SH: &str = include_str!("../../../scripts/container-doctor.sh"); const DOCTOR_SERVICE: &str = include_str!("../../../image-recipe/configs/archipelago-doctor.service"); const DOCTOR_TIMER: &str = include_str!("../../../image-recipe/configs/archipelago-doctor.timer"); const DOCTOR_SH_PATH: &str = "/home/archipelago/archy/scripts/container-doctor.sh"; const DOCTOR_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-doctor.service"; const DOCTOR_TIMER_PATH: &str = "/etc/systemd/system/archipelago-doctor.timer"; const NGINX_CONF_PATH: &str = "/etc/nginx/sites-available/archipelago"; const RUNTIME_ASSETS_DIR: &str = "/opt/archipelago/web-ui/archipelago-runtime"; /// Inserted into every server block of the nginx config that lacks the /// `/api/app-catalog` proxy. Kept in sync with the canonical block in /// image-recipe/configs/nginx-archipelago.conf. const NGINX_APP_CATALOG_BLOCK: &str = "\n # App Store catalog proxy — backend fetches from configured registries\n # so the browser doesn't hit CORS/CSP. 
Without this block nginx falls\n # through to the SPA index.html and the frontend gets HTML back instead\n # of JSON.\n location /api/app-catalog {\n proxy_pass http://127.0.0.1:5678;\n proxy_http_version 1.1;\n proxy_set_header Host $host;\n proxy_set_header X-Real-IP $remote_addr;\n proxy_set_header Cookie $http_cookie;\n proxy_connect_timeout 15s;\n proxy_read_timeout 30s;\n proxy_send_timeout 15s;\n error_page 502 503 = @backend_unavailable;\n error_page 504 = @backend_timeout;\n }\n\n"; /// Entry point called from main startup. Never returns an error to the caller — /// failing to bootstrap host artifacts must not prevent the backend from serving. pub async fn ensure_doctor_installed() { match run_service_override_repair().await { Ok(true) => info!("Removed stale Archipelago dev-mode service override"), Ok(false) => debug!("No stale Archipelago dev-mode service override found"), Err(e) => warn!("Service override repair failed (non-fatal): {:#}", e), } match run_runtime_assets().await { Ok(changed) if changed => info!("Runtime assets synchronized from OTA payload"), Ok(_) => debug!("No OTA runtime payload to synchronize"), Err(e) => warn!("Runtime asset bootstrap failed (non-fatal): {:#}", e), } match run().await { Ok(changed) if changed => info!("Doctor artifacts synchronized with binary"), Ok(_) => debug!("Doctor artifacts already in sync"), Err(e) => warn!("Doctor bootstrap failed (non-fatal): {:#}", e), } match run_nginx().await { Ok(true) => info!("Patched nginx config to proxy /api/app-catalog"), Ok(false) => debug!("Nginx already has /api/app-catalog block"), Err(e) => warn!("Nginx bootstrap failed (non-fatal): {:#}", e), } match run_bitcoin_rpc_repair().await { Ok(true) => info!("Repaired Bitcoin RPC bind settings and restarted Bitcoin containers"), Ok(false) => debug!("Bitcoin RPC bind settings already usable"), Err(e) => warn!("Bitcoin RPC repair failed (non-fatal): {:#}", e), } match tighten_secrets_dir().await { Ok(n) if n > 0 => info!(tightened = 
n, "Tightened mode on secret files"), Ok(_) => debug!("Secrets directory already at expected mode"), Err(e) => warn!("Secrets dir tightening failed (non-fatal): {:#}", e), } // Podman self-heal MUST be the last bootstrap stage. If podman's // runtime state is wedged, the orchestrator's first reconcile tick // (which fires seconds after bootstrap returns) will hang or error // on every container. Cleaning the runroot here gives the rest of // the process a healthy podman to talk to. match heal_podman_state().await { Ok(PodmanHealOutcome::Healthy) => debug!("podman runtime state healthy"), Ok(PodmanHealOutcome::Cleaned) => warn!( "podman runtime state was wedged at startup — cleaned runroot and re-probed (CRITICAL)" ), Err(e) => warn!("podman self-heal failed (non-fatal, will retry next boot): {:#}", e), } } #[derive(Debug, PartialEq, Eq)] enum PodmanHealOutcome { Healthy, Cleaned, } /// Probe `podman info`. If it succeeds the daemon's runtime state is /// fine and we return `Healthy` immediately. If it times out, fails to /// spawn, or returns an "invalid internal status" / "database state" /// error, the runtime state under `$XDG_RUNTIME_DIR/{containers,libpod}` /// is likely wedged. We delete those two dirs and re-probe — podman /// rebuilds runtime state from persistent storage under /// `$HOME/.local/share/containers/storage/`. /// /// `$XDG_RUNTIME_DIR/podman/` is **deliberately not touched**: that's /// where systemd's socket-activated `podman.sock` listener lives. If we /// removed it, every libpod HTTP call from the orchestrator would fail /// with "connection refused" until `systemctl --user restart /// podman.socket` ran — far worse than the wedge we'd be trying to fix. /// /// Why this is safe at startup: /// - We run BEFORE the orchestrator starts its reconcile loop, so no /// archipelago code is currently calling podman. /// - Persistent container metadata lives under /// `~/.local/share/containers/`, which we never touch. 
/// - `unless-stopped` containers and Quadlet-supervised services are
///   parented under user.slice, not archipelago.service, so they keep
///   running even while we clean podman's runtime view of them. After
///   the cleanup + re-probe podman re-discovers them.
///
/// What this does NOT cover:
/// - Storage corruption under `~/.local/share/containers/storage/`.
///   That requires a destructive `podman system reset`, which we will
///   never do automatically — operator must intervene.
/// - Networking corruption (netavark cache). Currently `podman info`
///   doesn't diagnose that; if cleanup doesn't fix it, the operator
///   will see the warning in the journal.
///
/// # Errors
/// Fails when `XDG_RUNTIME_DIR` is unset or is not an absolute path (in
/// which case nothing is deleted), or when `podman info` still fails
/// after the cleanup.
async fn heal_podman_state() -> Result<PodmanHealOutcome> {
    if probe_podman_ok().await {
        return Ok(PodmanHealOutcome::Healthy);
    }

    // Wedged. Clean runtime state and try again. Note: `podman/` is
    // intentionally absent from this list — see fn docstring.
    let xdg = std::env::var("XDG_RUNTIME_DIR")
        .context("XDG_RUNTIME_DIR not set; can't locate podman runtime state to clean")?;
    let runtime_root = PathBuf::from(xdg);

    // An empty or relative value would make the joins below resolve against
    // the process's working directory, and `remove_dir_all` would then wipe
    // an unrelated `./containers` or `./libpod`. Refuse rather than guess.
    anyhow::ensure!(
        runtime_root.is_absolute(),
        "XDG_RUNTIME_DIR ({}) is not an absolute path; refusing to clean podman runtime state",
        runtime_root.display()
    );

    for sub in ["containers", "libpod"] {
        let path = runtime_root.join(sub);
        match fs::remove_dir_all(&path).await {
            Ok(()) => debug!(path = %path.display(), "removed podman runtime state dir"),
            // Already absent is as good as removed.
            Err(e) if e.kind() == std::io::ErrorKind::NotFound => {}
            // Best effort: keep going and let the re-probe decide the outcome.
            Err(e) => warn!(path = %path.display(), "remove failed: {}", e),
        }
    }

    if probe_podman_ok().await {
        Ok(PodmanHealOutcome::Cleaned)
    } else {
        Err(anyhow::anyhow!(
            "podman info still failing after runtime cleanup; storage may be corrupt — operator must intervene"
        ))
    }
}

/// True iff `podman info` returns 0 within 5s. Any timeout, spawn
/// failure, or non-zero exit reads as "wedged" and triggers cleanup.
async fn probe_podman_ok() -> bool { use std::time::Duration; let probe = tokio::time::timeout( Duration::from_secs(5), tokio::process::Command::new("podman") .arg("info") .arg("--format=json") .output(), ) .await; match probe { Ok(Ok(out)) => out.status.success(), Ok(Err(_)) | Err(_) => false, } } /// Make sure /var/lib/archipelago/secrets/ stays 0700 owned by archipelago, /// and every file inside is 0600. The parent dir mode is the load-bearing /// boundary against host-side reads from other UIDs (rootless container /// escapes get mapped to UID >= 100000 and can't traverse a 0700/uid=1000 /// directory). The per-file 0600 sweep is defense-in-depth in case some /// installer wrote a 0644 file. async fn tighten_secrets_dir() -> Result { let dir = Path::new("/var/lib/archipelago/secrets"); if !dir.exists() { return Ok(0); } use std::os::unix::fs::PermissionsExt; fs::set_permissions(dir, std::fs::Permissions::from_mode(0o700)) .await .with_context(|| format!("chmod 0700 {}", dir.display()))?; let mut entries = fs::read_dir(dir) .await .with_context(|| format!("read_dir {}", dir.display()))?; let mut tightened = 0u32; while let Some(entry) = entries.next_entry().await? 
{ let path = entry.path(); let meta = match entry.metadata().await { Ok(m) => m, Err(_) => continue, }; if !meta.is_file() { continue; } if meta.permissions().mode() & 0o777 != 0o600 { fs::set_permissions(&path, std::fs::Permissions::from_mode(0o600)) .await .with_context(|| format!("chmod 0600 {}", path.display()))?; tightened += 1; } } Ok(tightened) } async fn run_service_override_repair() -> Result { let override_path = Path::new("/etc/systemd/system/archipelago.service.d/override.conf"); let Ok(content) = fs::read_to_string(override_path).await else { return Ok(false); }; if !content.contains("ARCHIPELAGO_DEV_MODE=true") { return Ok(false); } let only_dev_mode_override = content .lines() .map(str::trim) .filter(|line| !line.is_empty() && !line.starts_with('#')) .all(|line| line == "[Service]" || line == "Environment=ARCHIPELAGO_DEV_MODE=true"); if !only_dev_mode_override { warn!( path = %override_path.display(), "Archipelago service override contains ARCHIPELAGO_DEV_MODE=true plus other settings; leaving it untouched" ); return Ok(false); } let path_s = override_path.to_string_lossy().to_string(); let status = host_sudo(&["rm", "-f", &path_s]) .await .with_context(|| format!("remove {}", override_path.display()))?; if !status.success() { anyhow::bail!("remove {} exited with {}", override_path.display(), status); } let _ = host_sudo(&["systemctl", "daemon-reload"]).await; Ok(true) } async fn run_runtime_assets() -> Result { // The v1.7.50 OTA bridge puts scripts/apps/docker assets inside the // frontend tarball because older binaries only know how to apply the // backend binary and frontend archive. Once the new backend starts, it // promotes that payload into /opt so app installs use the matching specs. 
let runtime_dir = Path::new(RUNTIME_ASSETS_DIR); if !runtime_dir.exists() { return Ok(false); } let mut changed = false; for (relative, dest) in [ ("apps", "/opt/archipelago/apps"), ("scripts", "/opt/archipelago/scripts"), ("docker", "/opt/archipelago/docker"), ] { let src = runtime_dir.join(relative); if src.exists() { replace_dir_from_runtime(&src, dest).await?; if relative == "scripts" { let _ = host_sudo(&[ "find", dest, "-type", "f", "-name", "*.sh", "-exec", "chmod", "755", "{}", "+", ]) .await; let image_versions = format!("{}/image-versions.sh", dest); if Path::new(&image_versions).exists() { let _ = host_sudo(&["cp", &image_versions, "/opt/archipelago/image-versions.sh"]) .await; } } changed = true; } } let configs = runtime_dir.join("image-recipe/configs"); for unit in ["archipelago-doctor.service", "archipelago-doctor.timer"] { let src = configs.join(unit); if src.exists() { let src_s = src.to_string_lossy().to_string(); let dest = format!("/etc/systemd/system/{}", unit); let status = host_sudo(&["install", "-m", "644", &src_s, &dest]) .await .with_context(|| format!("install {}", unit))?; if !status.success() { anyhow::bail!("install {} exited with {}", unit, status); } changed = true; } } if changed { let _ = host_sudo(&["systemctl", "daemon-reload"]).await; let _ = host_sudo(&["systemctl", "enable", "--now", "archipelago-doctor.timer"]).await; } Ok(changed) } async fn replace_dir_from_runtime(src: &Path, dest: &str) -> Result<()> { let tmp = format!("{}.new.{}", dest, chrono::Utc::now().timestamp_millis()); let src_dot = path_dot(src); let mkdir = host_sudo(&["mkdir", "-p", &tmp]) .await .with_context(|| format!("mkdir {}", tmp))?; if !mkdir.success() { anyhow::bail!("mkdir {} exited with {}", tmp, mkdir); } let copy = host_sudo(&["cp", "-a", &src_dot, &tmp]) .await .with_context(|| format!("copy runtime {} -> {}", src.display(), tmp))?; if !copy.success() { let _ = host_sudo(&["rm", "-rf", &tmp]).await; anyhow::bail!("copy runtime {} exited with {}", 
src.display(), copy); } let _ = host_sudo(&["mkdir", "-p", dest]).await; let cleanup = host_sudo(&[ "find", dest, "-mindepth", "1", "-maxdepth", "1", "-exec", "rm", "-rf", "{}", "+", ]) .await .with_context(|| format!("clean {}", dest))?; if !cleanup.success() { let _ = host_sudo(&["rm", "-rf", &tmp]).await; anyhow::bail!("clean {} exited with {}", dest, cleanup); } let tmp_dot = format!("{}/.", tmp); let promote = host_sudo(&["cp", "-a", &tmp_dot, dest]) .await .with_context(|| format!("promote {} -> {}", tmp, dest))?; let _ = host_sudo(&["rm", "-rf", &tmp]).await; if !promote.success() { anyhow::bail!("promote {} exited with {}", dest, promote); } Ok(()) } fn path_dot(path: &Path) -> String { let mut p = PathBuf::from(path); p.push("."); p.to_string_lossy().to_string() } async fn run_bitcoin_rpc_repair() -> Result { // Older installs can have a container-owned bitcoin.conf with only rpcauth // and printtoconsole. In that state bitcoind is healthy internally, but the // host-network bitcoin-ui proxy to 127.0.0.1:8332 gets connection resets. // Repair it at startup so OTA fixes existing nodes without a manual // uninstall/reinstall. let script = r#" set -eu conf=/var/lib/archipelago/bitcoin/bitcoin.conf [ -f "$conf" ] || exit 0 changed=0 ensure_line() { line="$1" key="${line%%=*}" if ! 
grep -q "^${key}=" "$conf"; then printf '%s\n' "$line" >> "$conf" changed=1 fi } ensure_line server=1 ensure_line rpcbind=0.0.0.0 ensure_line rpcallowip=0.0.0.0/0 ensure_line rpcport=8332 ensure_line listen=1 [ "$changed" -eq 0 ] && exit 0 exit 2 "#; let status = host_sudo(&["sh", "-lc", script]) .await .context("repair bitcoin.conf RPC bind settings")?; match status.code() { Some(0) => Ok(false), Some(2) => { for name in ["bitcoin-knots", "bitcoin-core", "archy-bitcoin-ui"] { let _ = tokio::process::Command::new("podman") .args(["restart", name]) .status() .await; } Ok(true) } _ => { warn!("Bitcoin RPC repair helper exited with {}", status); Ok(false) } } } async fn run() -> Result { // Dev-box guard: on contributors' laptops `/home/archipelago/archy` is // typically a symlink into the git checkout, and writing through it // would clobber the working tree with whatever the binary happens to // have been compiled from. Production ISO installs materialize a real // directory. let home_archy = Path::new("/home/archipelago/archy"); if fs::symlink_metadata(home_archy) .await .map(|m| m.file_type().is_symlink()) .unwrap_or(false) { debug!("/home/archipelago/archy is a symlink — skipping doctor bootstrap (dev box)"); return Ok(false); } // Skip entirely on machines without the canonical scripts directory — // writing orphan files there just causes confusion. let scripts_dir = Path::new(DOCTOR_SH_PATH) .parent() .context("doctor script path has no parent")?; if !scripts_dir.exists() { debug!( "Scripts dir {} missing — skipping doctor bootstrap", scripts_dir.display() ); return Ok(false); } let mut changed = false; // 1. Script — lives in archipelago's home dir, user-writable. 
if needs_write(DOCTOR_SH_PATH, DOCTOR_SH).await { fs::write(DOCTOR_SH_PATH, DOCTOR_SH) .await .with_context(|| format!("write {}", DOCTOR_SH_PATH))?; let _ = tokio::process::Command::new("chmod") .args(["+x", DOCTOR_SH_PATH]) .status() .await; info!("Updated {}", DOCTOR_SH_PATH); changed = true; } // 2. Systemd unit files — /etc is restricted; route through host_sudo. let service_changed = write_root_if_needed(DOCTOR_SERVICE_PATH, DOCTOR_SERVICE).await?; let timer_changed = write_root_if_needed(DOCTOR_TIMER_PATH, DOCTOR_TIMER).await?; changed = changed || service_changed || timer_changed; // 3. Reload + enable. Only when we actually touched units, or when the // timer isn't enabled yet (catches fresh upgrades of boxes that predate // the doctor entirely). let timer_enabled = is_timer_enabled().await; if service_changed || timer_changed || !timer_enabled { if let Err(e) = host_sudo(&["systemctl", "daemon-reload"]).await { warn!("daemon-reload failed: {:#}", e); } if let Err(e) = host_sudo(&["systemctl", "enable", "--now", "archipelago-doctor.timer"]).await { warn!("enable archipelago-doctor.timer failed: {:#}", e); } else if !timer_enabled { info!("Enabled archipelago-doctor.timer"); } } Ok(changed) } async fn needs_write(path: &str, expected: &str) -> bool { match fs::read_to_string(path).await { Ok(current) => current != expected, Err(_) => true, } } /// Write content to a root-owned path via `sudo mv` of a user-owned tmp file. /// Returns true if a write happened. 
async fn write_root_if_needed(path: &str, content: &str) -> Result { if !needs_write(path, content).await { return Ok(false); } let tmp = format!( "/tmp/archipelago-bootstrap-{}-{}.tmp", std::process::id(), Path::new(path) .file_name() .and_then(|n| n.to_str()) .unwrap_or("unit") ); fs::write(&tmp, content) .await .with_context(|| format!("write tmp {}", tmp))?; let status = host_sudo(&["mv", &tmp, path]) .await .with_context(|| format!("sudo mv {} -> {}", tmp, path))?; if !status.success() { let _ = fs::remove_file(&tmp).await; anyhow::bail!("sudo mv to {} exited with {}", path, status); } info!("Updated {}", path); Ok(true) } async fn is_timer_enabled() -> bool { tokio::process::Command::new("systemctl") .args(["is-enabled", "--quiet", "archipelago-doctor.timer"]) .status() .await .map(|s| s.success()) .unwrap_or(false) } /// Patch the nginx site config to add a `/api/app-catalog` proxy block if /// it's missing. The original ISO shipped individual per-endpoint `location` /// blocks and no catch-all `/api/`, so `/api/app-catalog` silently fell /// through to the SPA `index.html` and the frontend got HTML instead of /// JSON. We anchor the insert to the DWN comment that already sits right /// after the `/api/blob` block, so the new block lands in both the HTTP /// and HTTPS server blocks. /// /// Validates via `nginx -t` before reloading. On failure the patch is /// rolled back from a backup written just before the write. async fn run_nginx() -> Result { // Skip on dev symlinks — we don't want to touch `/etc/nginx` on laptops. 
let home_archy = Path::new("/home/archipelago/archy"); if fs::symlink_metadata(home_archy) .await .map(|m| m.file_type().is_symlink()) .unwrap_or(false) { return Ok(false); } if !Path::new(NGINX_CONF_PATH).exists() { debug!("{} missing — skipping nginx bootstrap", NGINX_CONF_PATH); return Ok(false); } let content = fs::read_to_string(NGINX_CONF_PATH) .await .with_context(|| format!("read {}", NGINX_CONF_PATH))?; if content.contains("location /api/app-catalog") { return Ok(false); } // The DWN comment sits at the same indent right after the `/api/blob` // block in both server blocks — a stable anchor that existed on every // ISO shipped to date. If it's absent (config got heavily customized), // we bail rather than guess where to splice. let anchor = " # DWN endpoints — peer access over Tor (no auth)"; if !content.contains(anchor) { warn!("nginx conf missing DWN anchor — skipping /api/app-catalog patch"); return Ok(false); } let replacement = format!("{}{}", NGINX_APP_CATALOG_BLOCK, anchor); let patched = content.replace(anchor, &replacement); // Write patched config via a user-owned tmp + sudo mv, after stashing // a backup so we can revert if `nginx -t` hates what we produced. let pid = std::process::id(); let tmp = format!("/tmp/archipelago-nginx-{}.conf", pid); fs::write(&tmp, &patched) .await .with_context(|| format!("write {}", tmp))?; let backup = format!("/tmp/archipelago-nginx-backup-{}.conf", pid); if let Err(e) = host_sudo(&["cp", NGINX_CONF_PATH, &backup]).await { let _ = fs::remove_file(&tmp).await; return Err(e.context("backup nginx conf")); } let mv = host_sudo(&["mv", &tmp, NGINX_CONF_PATH]).await; match mv { Ok(s) if s.success() => {} Ok(s) => { let _ = fs::remove_file(&tmp).await; anyhow::bail!("sudo mv nginx conf exited with {}", s); } Err(e) => { let _ = fs::remove_file(&tmp).await; return Err(e.context("mv tmp -> nginx conf")); } } // Validate. 
let test = host_sudo(&["nginx", "-t"]).await; let valid = matches!(&test, Ok(s) if s.success()); if !valid { warn!("nginx -t failed after patch — reverting"); let _ = host_sudo(&["mv", &backup, NGINX_CONF_PATH]).await; if let Err(e) = test { return Err(e.context("nginx -t")); } anyhow::bail!("nginx config invalid after patch — reverted"); } // Reload nginx so the new block takes effect immediately. Reload (not // restart) keeps in-flight connections alive. if let Err(e) = host_sudo(&["systemctl", "reload", "nginx"]).await { warn!("nginx reload failed (non-fatal): {:#}", e); } let _ = host_sudo(&["rm", "-f", &backup]).await; Ok(true) }