archy/core/archipelago/src/bootstrap.rs
2026-05-17 18:40:50 -04:00

619 lines
24 KiB
Rust

//! Bootstrap host-side artifacts on every archipelago startup.
//!
//! The update pipeline swaps the archipelago binary but does not touch
//! scripts, systemd units, or nginx configuration — those are installed
//! once by the ISO builder. Without this module, changes to
//! `container-doctor.sh`, the doctor service/timer, or the nginx config
//! never reach boxes installed before the change.
//!
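//! The following are synced or repaired on startup, in this order:
//! 1. A stale dev-mode systemd override for the archipelago service.
//! 2. Runtime assets shipped inside the OTA payload (apps, scripts, docker,
//! nginx config, doctor units).
//! 3. Doctor artifacts (container-doctor.sh + service + timer).
//! 4. Missing nginx backend proxy blocks required for frontend fetches to
//! reach the backend instead of the SPA fallback.
//! 5. Bitcoin RPC settings in an existing bitcoin.conf.
//! 6. Permissions on the secrets directory and the files inside it.
//!
//! Finally, podman health is probed and reported (never repaired automatically).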
//!
//! Idempotent: no-ops on boxes that are already in sync. All work is
//! best-effort — failures are logged but never abort the backend.
use anyhow::{Context, Result};
use std::path::{Path, PathBuf};
use tokio::fs;
use tracing::{debug, info, warn};
use crate::update::host_sudo;
/// Contents of `container-doctor.sh`, embedded into the binary at compile time.
const DOCTOR_SH: &str = include_str!("../../../scripts/container-doctor.sh");
/// Contents of the doctor systemd service unit, embedded at compile time.
const DOCTOR_SERVICE: &str =
include_str!("../../../image-recipe/configs/archipelago-doctor.service");
/// Contents of the doctor systemd timer unit, embedded at compile time.
const DOCTOR_TIMER: &str = include_str!("../../../image-recipe/configs/archipelago-doctor.timer");
/// Host path the embedded doctor script is written to.
const DOCTOR_SH_PATH: &str = "/home/archipelago/archy/scripts/container-doctor.sh";
/// Host path of the doctor service unit.
const DOCTOR_SERVICE_PATH: &str = "/etc/systemd/system/archipelago-doctor.service";
/// Host path of the doctor timer unit.
const DOCTOR_TIMER_PATH: &str = "/etc/systemd/system/archipelago-doctor.timer";
/// Nginx site config that receives the missing proxy blocks.
const NGINX_CONF_PATH: &str = "/etc/nginx/sites-available/archipelago";
/// Enabled-site path for the same config; patched separately only when it
/// does not canonicalize to a path that was already handled.
const NGINX_ENABLED_CONF_PATH: &str = "/etc/nginx/sites-enabled/archipelago";
/// Directory checked at startup for an OTA runtime payload.
const RUNTIME_ASSETS_DIR: &str = "/opt/archipelago/web-ui/archipelago-runtime";
/// Inserted into every server block of the nginx config that lacks the
/// `/api/app-catalog` proxy. Kept in sync with the canonical block in
/// image-recipe/configs/nginx-archipelago.conf.
const NGINX_APP_CATALOG_BLOCK: &str = "\n # App Store catalog proxy — backend fetches from configured registries\n # so the browser doesn't hit CORS/CSP. Without this block nginx falls\n # through to the SPA index.html and the frontend gets HTML back instead\n # of JSON.\n location /api/app-catalog {\n proxy_pass http://127.0.0.1:5678;\n proxy_http_version 1.1;\n proxy_set_header Host $host;\n proxy_set_header X-Real-IP $remote_addr;\n proxy_set_header Cookie $http_cookie;\n proxy_connect_timeout 15s;\n proxy_read_timeout 30s;\n proxy_send_timeout 15s;\n error_page 502 503 = @backend_unavailable;\n error_page 504 = @backend_timeout;\n }\n\n";
/// Inserted in front of every `/electrs-status` location anchor when the
/// config contains no `/bitcoin-status` location at all.
const NGINX_BITCOIN_STATUS_BLOCK: &str = "\n location /bitcoin-status {\n proxy_pass http://127.0.0.1:5678/bitcoin-status;\n proxy_http_version 1.1;\n proxy_set_header Host $host;\n proxy_connect_timeout 10s;\n proxy_read_timeout 10s;\n proxy_send_timeout 5s;\n error_page 502 503 = @backend_unavailable;\n error_page 504 = @backend_timeout;\n }\n";
/// Startup entry point: runs every bootstrap stage in sequence and logs the
/// result of each one.
///
/// Nothing is returned to the caller. A failing stage is logged at `warn`
/// level and the remaining stages still run, so a bootstrap problem can never
/// keep the backend from serving.
pub async fn ensure_doctor_installed() {
    match run_service_override_repair().await {
        Err(e) => warn!("Service override repair failed (non-fatal): {:#}", e),
        Ok(false) => debug!("No stale Archipelago dev-mode service override found"),
        Ok(true) => info!("Removed stale Archipelago dev-mode service override"),
    }

    match run_runtime_assets().await {
        Err(e) => warn!("Runtime asset bootstrap failed (non-fatal): {:#}", e),
        Ok(false) => debug!("No OTA runtime payload to synchronize"),
        Ok(true) => info!("Runtime assets synchronized from OTA payload"),
    }

    match run().await {
        Err(e) => warn!("Doctor bootstrap failed (non-fatal): {:#}", e),
        Ok(false) => debug!("Doctor artifacts already in sync"),
        Ok(true) => info!("Doctor artifacts synchronized with binary"),
    }

    match run_nginx().await {
        Err(e) => warn!("Nginx bootstrap failed (non-fatal): {:#}", e),
        Ok(false) => debug!("Nginx backend endpoint proxy blocks already present"),
        Ok(true) => info!("Patched nginx config to proxy missing backend endpoints"),
    }

    match run_bitcoin_rpc_repair().await {
        Err(e) => warn!("Bitcoin RPC repair failed (non-fatal): {:#}", e),
        Ok(false) => debug!("Bitcoin RPC bind settings already usable"),
        Ok(true) => {
            info!("Repaired Bitcoin RPC bind settings; running Bitcoin containers left untouched")
        }
    }

    match tighten_secrets_dir().await {
        Err(e) => warn!("Secrets dir tightening failed (non-fatal): {:#}", e),
        Ok(0) => debug!("Secrets directory already at expected mode"),
        Ok(count) => info!(tightened = count, "Tightened mode on secret files"),
    }

    // The podman probe has to stay the final stage. An earlier version wiped
    // transient runroot state whenever `podman info` failed, but on a live
    // node rootlessport/conmon processes may still be holding that state.
    // Deleting it behind their backs made things worse (containers lost
    // `.containerenv`, ports stayed bound, later starts failed), so the fault
    // is now only reported and any repair is left to the operator.
    match heal_podman_state().await {
        Err(e) => warn!(
            "podman self-heal failed (non-fatal, will retry next boot): {:#}",
            e
        ),
        Ok(PodmanHealOutcome::Unhealthy) => warn!(
            "podman runtime state is unhealthy at startup — skipping automatic runroot cleanup"
        ),
        Ok(PodmanHealOutcome::Healthy) => debug!("podman runtime state healthy"),
    }
}
/// Result of the startup podman probe as reported by `heal_podman_state`.
#[derive(Debug, PartialEq, Eq)]
enum PodmanHealOutcome {
/// The `podman info` probe succeeded.
Healthy,
/// The probe failed or timed out; the caller only logs this, nothing is repaired.
Unhealthy,
}
/// Runs the podman probe once and translates its boolean result into a
/// [`PodmanHealOutcome`].
///
/// Despite the name, nothing is repaired here. The function currently never
/// produces an `Err`; the `Result` wrapper only keeps the signature uniform
/// with the other bootstrap stages.
async fn heal_podman_state() -> Result<PodmanHealOutcome> {
    let outcome = if probe_podman_ok().await {
        PodmanHealOutcome::Healthy
    } else {
        PodmanHealOutcome::Unhealthy
    };
    Ok(outcome)
}
/// Returns `true` only when `podman info --format=json` exits successfully
/// within five seconds.
///
/// A timeout, a failure to spawn or collect the command, and a non-zero exit
/// status all yield `false`. The caller merely reports that as an unhealthy
/// runtime; no cleanup is performed.
///
/// NOTE(review): on timeout the `output()` future is dropped without
/// `kill_on_drop` being set, so the podman child presumably keeps running —
/// confirm that this is intended.
async fn probe_podman_ok() -> bool {
    use std::time::Duration;

    let mut info_cmd = tokio::process::Command::new("podman");
    info_cmd.arg("info").arg("--format=json");

    let outcome = tokio::time::timeout(Duration::from_secs(5), info_cmd.output()).await;
    matches!(outcome, Ok(Ok(out)) if out.status.success())
}
/// Forces `/var/lib/archipelago/secrets` to mode 0700 and every regular file
/// directly inside it to mode 0600.
///
/// The directory mode is the boundary that matters: other UIDs on the host
/// (rootless container escapes map to UID >= 100000) cannot traverse a 0700
/// directory. The per-file sweep is defense-in-depth for the case where an
/// installer dropped a 0644 file.
///
/// Returns the number of files whose mode had to be changed, or `Ok(0)` when
/// the directory does not exist. Entries whose metadata cannot be read and
/// entries that are not regular files are skipped. Ownership is not touched.
///
/// # Errors
/// Fails if the directory cannot be chmod-ed or listed, or if a file that
/// needs tightening cannot be chmod-ed.
async fn tighten_secrets_dir() -> Result<u32> {
    use std::os::unix::fs::PermissionsExt;

    let secrets = Path::new("/var/lib/archipelago/secrets");
    if !secrets.exists() {
        return Ok(0);
    }

    fs::set_permissions(secrets, std::fs::Permissions::from_mode(0o700))
        .await
        .with_context(|| format!("chmod 0700 {}", secrets.display()))?;

    let mut listing = fs::read_dir(secrets)
        .await
        .with_context(|| format!("read_dir {}", secrets.display()))?;

    let mut fixed = 0u32;
    loop {
        let Some(entry) = listing.next_entry().await? else {
            break;
        };
        let Ok(meta) = entry.metadata().await else {
            continue;
        };
        let already_tight = meta.permissions().mode() & 0o777 == 0o600;
        if !meta.is_file() || already_tight {
            continue;
        }
        let file = entry.path();
        fs::set_permissions(&file, std::fs::Permissions::from_mode(0o600))
            .await
            .with_context(|| format!("chmod 0600 {}", file.display()))?;
        fixed += 1;
    }
    Ok(fixed)
}
/// Removes the systemd drop-in `archipelago.service.d/override.conf` when it
/// does nothing except switch on `ARCHIPELAGO_DEV_MODE=true`.
///
/// Returns `Ok(true)` when the override was deleted (a `daemon-reload` is then
/// attempted, its result ignored). Returns `Ok(false)` when the file cannot be
/// read, does not mention dev mode, or carries any additional setting — in the
/// last case a warning is logged and the file is left alone.
///
/// # Errors
/// Fails when the privileged `rm` cannot be run or exits non-zero.
async fn run_service_override_repair() -> Result<bool> {
    let override_path = Path::new("/etc/systemd/system/archipelago.service.d/override.conf");

    let content = match fs::read_to_string(override_path).await {
        Ok(text) => text,
        Err(_) => return Ok(false),
    };
    if !content.contains("ARCHIPELAGO_DEV_MODE=true") {
        return Ok(false);
    }

    // Blank lines and comments are ignored; anything other than the section
    // header and the dev-mode line counts as a foreign setting.
    let has_other_settings = content.lines().map(str::trim).any(|line| {
        let ignorable = line.is_empty() || line.starts_with('#');
        !ignorable && line != "[Service]" && line != "Environment=ARCHIPELAGO_DEV_MODE=true"
    });
    if has_other_settings {
        warn!(
            path = %override_path.display(),
            "Archipelago service override contains ARCHIPELAGO_DEV_MODE=true plus other settings; leaving it untouched"
        );
        return Ok(false);
    }

    let target = override_path.to_string_lossy().into_owned();
    let removal = host_sudo(&["rm", "-f", &target])
        .await
        .with_context(|| format!("remove {}", override_path.display()))?;
    if !removal.success() {
        anyhow::bail!("remove {} exited with {}", override_path.display(), removal);
    }

    let _ = host_sudo(&["systemctl", "daemon-reload"]).await;
    Ok(true)
}
async fn run_runtime_assets() -> Result<bool> {
// The v1.7.50 OTA bridge puts scripts/apps/docker assets inside the
// frontend tarball because older binaries only know how to apply the
// backend binary and frontend archive. Once the new backend starts, it
// promotes that payload into /opt so app installs use the matching specs.
let runtime_dir = Path::new(RUNTIME_ASSETS_DIR);
if !runtime_dir.exists() {
return Ok(false);
}
let mut changed = false;
for (relative, dest) in [
("apps", "/opt/archipelago/apps"),
("scripts", "/opt/archipelago/scripts"),
("docker", "/opt/archipelago/docker"),
] {
let src = runtime_dir.join(relative);
if src.exists() {
replace_dir_from_runtime(&src, dest).await?;
if relative == "scripts" {
let _ = host_sudo(&[
"find", dest, "-type", "f", "-name", "*.sh", "-exec", "chmod", "755", "{}", "+",
])
.await;
let image_versions = format!("{}/image-versions.sh", dest);
if Path::new(&image_versions).exists() {
let _ =
host_sudo(&["cp", &image_versions, "/opt/archipelago/image-versions.sh"])
.await;
}
}
changed = true;
}
}
let configs = runtime_dir.join("image-recipe/configs");
let nginx_src = configs.join("nginx-archipelago.conf");
if nginx_src.exists() {
let src_s = nginx_src.to_string_lossy().to_string();
let status = host_sudo(&[
"install",
"-m",
"644",
&src_s,
"/etc/nginx/sites-available/archipelago",
])
.await
.context("install nginx-archipelago.conf")?;
if !status.success() {
anyhow::bail!("install nginx-archipelago.conf exited with {}", status);
}
changed = true;
}
for unit in ["archipelago-doctor.service", "archipelago-doctor.timer"] {
let src = configs.join(unit);
if src.exists() {
let src_s = src.to_string_lossy().to_string();
let dest = format!("/etc/systemd/system/{}", unit);
let status = host_sudo(&["install", "-m", "644", &src_s, &dest])
.await
.with_context(|| format!("install {}", unit))?;
if !status.success() {
anyhow::bail!("install {} exited with {}", unit, status);
}
changed = true;
}
}
if changed {
let _ = host_sudo(&["systemctl", "daemon-reload"]).await;
if nginx_src.exists() {
match host_sudo(&["nginx", "-t"]).await {
Ok(status) if status.success() => {
let _ = host_sudo(&["systemctl", "reload", "nginx"]).await;
}
Ok(status) => {
tracing::warn!("nginx config test failed after runtime sync: {}", status);
}
Err(e) => {
tracing::warn!("failed to test nginx config after runtime sync: {}", e);
}
}
}
}
Ok(changed)
}
/// Replaces the contents of `dest` with the contents of `src`.
///
/// The payload is first copied into a sibling staging directory
/// (`<dest>.new.<unix-millis>`), so nothing in `dest` is deleted until the
/// copy out of the runtime payload has succeeded. The staging directory is
/// removed on every exit path, including when a privileged command could not
/// be run at all, so failed runs do not pile up full copies next to `dest`.
///
/// # Errors
/// Fails when creating the staging directory, copying into it, emptying
/// `dest`, or promoting the staged copy fails.
async fn replace_dir_from_runtime(src: &Path, dest: &str) -> Result<()> {
    let tmp = format!("{}.new.{}", dest, chrono::Utc::now().timestamp_millis());
    let outcome = stage_and_promote_runtime_dir(src, dest, &tmp).await;
    // Best-effort cleanup regardless of how staging/promotion ended.
    let _ = host_sudo(&["rm", "-rf", &tmp]).await;
    outcome
}

/// Copies `src` into the staging directory `tmp`, empties `dest`, then copies
/// the staged tree into `dest`. Leaves removal of `tmp` to the caller.
async fn stage_and_promote_runtime_dir(src: &Path, dest: &str, tmp: &str) -> Result<()> {
    // `<dir>/.` makes `cp -a` copy the directory's contents, dotfiles included.
    let src_dot = path_dot(src);

    let mkdir = host_sudo(&["mkdir", "-p", tmp])
        .await
        .with_context(|| format!("mkdir {}", tmp))?;
    if !mkdir.success() {
        anyhow::bail!("mkdir {} exited with {}", tmp, mkdir);
    }

    let copy = host_sudo(&["cp", "-a", &src_dot, tmp])
        .await
        .with_context(|| format!("copy runtime {} -> {}", src.display(), tmp))?;
    if !copy.success() {
        anyhow::bail!("copy runtime {} exited with {}", src.display(), copy);
    }

    // If this fails, the `find` below fails too and reports the problem.
    let _ = host_sudo(&["mkdir", "-p", dest]).await;
    let cleanup = host_sudo(&[
        "find",
        dest,
        "-mindepth",
        "1",
        "-maxdepth",
        "1",
        "-exec",
        "rm",
        "-rf",
        "{}",
        "+",
    ])
    .await
    .with_context(|| format!("clean {}", dest))?;
    if !cleanup.success() {
        anyhow::bail!("clean {} exited with {}", dest, cleanup);
    }

    let tmp_dot = format!("{}/.", tmp);
    let promote = host_sudo(&["cp", "-a", &tmp_dot, dest])
        .await
        .with_context(|| format!("promote {} -> {}", tmp, dest))?;
    if !promote.success() {
        anyhow::bail!("promote {} exited with {}", dest, promote);
    }
    Ok(())
}
/// Returns `path` with a trailing `.` component appended, as a string
/// (`/a/b` becomes `/a/b/.`). Non-UTF-8 bytes are replaced lossily.
fn path_dot(path: &Path) -> String {
    path.join(".").to_string_lossy().into_owned()
}
/// Appends missing RPC-related settings to an existing `bitcoin.conf`.
///
/// Older installs can have a container-owned bitcoin.conf with only rpcauth
/// and printtoconsole. Repair it at startup so OTA fixes existing nodes
/// without a manual uninstall/reinstall. Bind/port stay in the container
/// command line to avoid duplicate RPC endpoint definitions.
///
/// The helper script exits 0 when nothing had to change (or the file does not
/// exist) and 2 when at least one line was appended. Returns `Ok(true)` only
/// for exit code 2; any other exit is logged and reported as `Ok(false)`.
///
/// # Errors
/// Fails only when the privileged shell cannot be run at all.
async fn run_bitcoin_rpc_repair() -> Result<bool> {
    // Inside `ensure_line`, a config whose last byte is not a newline gets one
    // first; otherwise the first appended setting would be glued onto the
    // existing last line and corrupt both settings.
    let script = r#"
set -eu
conf=/var/lib/archipelago/bitcoin/bitcoin.conf
[ -f "$conf" ] || exit 0
changed=0
ensure_line() {
line="$1"
key="${line%%=*}"
if ! grep -q "^${key}=" "$conf"; then
if [ -n "$(tail -c 1 "$conf")" ]; then
printf '\n' >> "$conf"
fi
printf '%s\n' "$line" >> "$conf"
changed=1
fi
}
ensure_line server=1
ensure_line rpcallowip=0.0.0.0/0
ensure_line listen=1
[ "$changed" -eq 0 ] && exit 0
exit 2
"#;
    let status = host_sudo(&["sh", "-lc", script])
        .await
        .context("repair bitcoin.conf RPC bind settings")?;
    match status.code() {
        Some(0) => Ok(false),
        // Do not restart Bitcoin from bootstrap. During IBD, an automatic
        // restart can cost hours of progress. The repaired file is only a
        // fallback for future starts; current containers keep their command-line
        // RPC args until an operator or update intentionally restarts them.
        Some(2) => Ok(true),
        _ => {
            warn!("Bitcoin RPC repair helper exited with {}", status);
            Ok(false)
        }
    }
}
async fn run() -> Result<bool> {
// Dev-box guard: on contributors' laptops `/home/archipelago/archy` is
// typically a symlink into the git checkout, and writing through it
// would clobber the working tree with whatever the binary happens to
// have been compiled from. Production ISO installs materialize a real
// directory.
let home_archy = Path::new("/home/archipelago/archy");
if fs::symlink_metadata(home_archy)
.await
.map(|m| m.file_type().is_symlink())
.unwrap_or(false)
{
debug!("/home/archipelago/archy is a symlink — skipping doctor bootstrap (dev box)");
return Ok(false);
}
// Skip entirely on machines without the canonical scripts directory —
// writing orphan files there just causes confusion.
let scripts_dir = Path::new(DOCTOR_SH_PATH)
.parent()
.context("doctor script path has no parent")?;
if !scripts_dir.exists() {
debug!(
"Scripts dir {} missing — skipping doctor bootstrap",
scripts_dir.display()
);
return Ok(false);
}
let mut changed = false;
// 1. Script — lives in archipelago's home dir, user-writable.
if needs_write(DOCTOR_SH_PATH, DOCTOR_SH).await {
fs::write(DOCTOR_SH_PATH, DOCTOR_SH)
.await
.with_context(|| format!("write {}", DOCTOR_SH_PATH))?;
let _ = tokio::process::Command::new("chmod")
.args(["+x", DOCTOR_SH_PATH])
.status()
.await;
info!("Updated {}", DOCTOR_SH_PATH);
changed = true;
}
// 2. Systemd unit files — /etc is restricted; route through host_sudo.
let service_changed = write_root_if_needed(DOCTOR_SERVICE_PATH, DOCTOR_SERVICE).await?;
let timer_changed = write_root_if_needed(DOCTOR_TIMER_PATH, DOCTOR_TIMER).await?;
changed = changed || service_changed || timer_changed;
// 3. Reload if units changed. Do not enable/start the timer here: lifecycle
// qualification and explicit app operations need deterministic Podman
// ownership, and the doctor can race those flows. Operators can enable it
// separately when they want periodic host repair.
if service_changed || timer_changed {
if let Err(e) = host_sudo(&["systemctl", "daemon-reload"]).await {
warn!("daemon-reload failed: {:#}", e);
}
}
Ok(changed)
}
/// Reports whether `path` has to be (re)written to hold `expected`.
///
/// `true` when the file cannot be read as UTF-8 text for any reason or when
/// its contents differ from `expected`.
async fn needs_write(path: &str, expected: &str) -> bool {
    fs::read_to_string(path)
        .await
        .map_or(true, |current| current != expected)
}
/// Writes `content` to the root-owned `path` when the file is missing or
/// differs, by staging a user-owned tmp file and installing it with
/// `install -m 644` through `host_sudo`.
///
/// `install` gives the destination a fixed 0644 mode and the invoking
/// (privileged) user's ownership instead of carrying over the tmp file's
/// owner and umask-dependent mode, matching how `run_runtime_assets` installs
/// the same unit files. The tmp file is removed on every exit path.
///
/// Returns `Ok(true)` if a write happened, `Ok(false)` if the file already
/// matched.
///
/// # Errors
/// Fails when the tmp file cannot be written, or when the privileged
/// `install` cannot be run or exits non-zero.
async fn write_root_if_needed(path: &str, content: &str) -> Result<bool> {
    if !needs_write(path, content).await {
        return Ok(false);
    }

    let file_name = Path::new(path)
        .file_name()
        .and_then(|n| n.to_str())
        .unwrap_or("unit");
    // NOTE(review): the tmp name is predictable (pid + file name); this relies
    // on /tmp not being writable by hostile local users — confirm.
    let tmp = format!(
        "/tmp/archipelago-bootstrap-{}-{}.tmp",
        std::process::id(),
        file_name
    );
    fs::write(&tmp, content)
        .await
        .with_context(|| format!("write tmp {}", tmp))?;

    let installed = host_sudo(&["install", "-m", "644", &tmp, path]).await;
    // `install` copies, so the staged file is always ours to clean up.
    let _ = fs::remove_file(&tmp).await;

    let status = installed.with_context(|| format!("sudo install {} -> {}", tmp, path))?;
    if !status.success() {
        anyhow::bail!("sudo install to {} exited with {}", path, status);
    }
    info!("Updated {}", path);
    Ok(true)
}
/// Adds missing backend proxy blocks to the nginx site config(s).
///
/// Older ISO configs shipped individual per-endpoint `location` blocks, so a
/// missing endpoint silently fell through to the SPA `index.html` and the
/// frontend received HTML instead of JSON.
///
/// Both the sites-available and the sites-enabled path are considered. Paths
/// that do not exist are skipped, and a path that canonicalizes to one that
/// was already handled is not patched a second time. Validation, reload and
/// rollback happen inside `patch_nginx_conf`.
///
/// Returns `Ok(true)` when at least one file was patched.
///
/// # Errors
/// Propagates the first error from `patch_nginx_conf`.
async fn run_nginx() -> Result<bool> {
    // A symlinked checkout marks a dev laptop; leave its `/etc/nginx` alone.
    let home_archy = Path::new("/home/archipelago/archy");
    let on_dev_box = match fs::symlink_metadata(home_archy).await {
        Ok(meta) => meta.file_type().is_symlink(),
        Err(_) => false,
    };
    if on_dev_box {
        return Ok(false);
    }

    let mut seen: Vec<PathBuf> = Vec::new();
    let mut any_patched = false;
    for path in [NGINX_CONF_PATH, NGINX_ENABLED_CONF_PATH] {
        let candidate = Path::new(path);
        if !candidate.exists() {
            debug!("{} missing — skipping nginx bootstrap", path);
            continue;
        }
        let resolved = match fs::canonicalize(candidate).await {
            Ok(real) => real,
            Err(_) => candidate.to_path_buf(),
        };
        if seen.contains(&resolved) {
            continue;
        }
        seen.push(resolved);
        if patch_nginx_conf(path).await? {
            any_patched = true;
        }
    }
    Ok(any_patched)
}
/// Splices the missing `/bitcoin-status` and `/api/app-catalog` proxy blocks
/// into the nginx config at `path`, validates the result and reloads nginx.
///
/// Each block is inserted in front of every occurrence of its anchor line; a
/// block whose anchor is absent is skipped with a warning. The original file
/// is backed up to `/tmp` before being replaced, and the backup must succeed:
/// without it a config rejected by `nginx -t` could not be rolled back.
///
/// Returns `Ok(true)` when the file was patched and validated, `Ok(false)`
/// when nothing needed (or could) be patched.
///
/// # Errors
/// Fails when the file cannot be read, the tmp file cannot be written, the
/// backup or the replacing `mv` fails, or `nginx -t` rejects the patched
/// config (the previous config is then restored if possible).
async fn patch_nginx_conf(path: &str) -> Result<bool> {
    let content = fs::read_to_string(path)
        .await
        .with_context(|| format!("read {}", path))?;
    let missing_app_catalog = !content.contains("location /api/app-catalog");
    let missing_bitcoin_status = !content.contains("location /bitcoin-status");
    if !missing_app_catalog && !missing_bitcoin_status {
        return Ok(false);
    }

    let mut patched = content.clone();
    if missing_bitcoin_status {
        let anchor = " location /electrs-status {";
        if !patched.contains(anchor) {
            warn!("nginx conf missing electrs-status anchor — skipping /bitcoin-status patch");
        } else {
            let replacement = format!("{}{}", NGINX_BITCOIN_STATUS_BLOCK, anchor);
            patched = patched.replace(anchor, &replacement);
        }
    }
    if missing_app_catalog {
        // The DWN comment sits at the same indent right after the `/api/blob`
        // block in both server blocks — a stable anchor that existed on every
        // ISO shipped to date. If it's absent (config got heavily customized),
        // skip rather than guess where to splice.
        let anchor = " # DWN endpoints — peer access over Tor (no auth)";
        if !patched.contains(anchor) {
            warn!("nginx conf missing DWN anchor — skipping /api/app-catalog patch");
        } else {
            let replacement = format!("{}{}", NGINX_APP_CATALOG_BLOCK, anchor);
            patched = patched.replace(anchor, &replacement);
        }
    }
    if patched == content {
        return Ok(false);
    }

    // Write patched config via a user-owned tmp + sudo mv, after stashing
    // a backup outside nginx include dirs so validation cannot load it too.
    let pid = std::process::id();
    let tmp = format!("/tmp/archipelago-nginx-{}.conf", pid);
    fs::write(&tmp, &patched)
        .await
        .with_context(|| format!("write {}", tmp))?;
    let backup = format!(
        "/tmp/archipelago-nginx-backup-{}-{}.conf",
        pid,
        patched.len()
    );
    // A non-zero exit means there is no usable backup; do not touch the live
    // config in that case.
    match host_sudo(&["cp", path, &backup]).await {
        Ok(s) if s.success() => {}
        Ok(s) => {
            let _ = fs::remove_file(&tmp).await;
            let _ = host_sudo(&["rm", "-f", &backup]).await;
            anyhow::bail!("backup nginx conf {} exited with {}", path, s);
        }
        Err(e) => {
            let _ = fs::remove_file(&tmp).await;
            return Err(e.context("backup nginx conf"));
        }
    }

    match host_sudo(&["mv", &tmp, path]).await {
        Ok(s) if s.success() => {}
        Ok(s) => {
            let _ = fs::remove_file(&tmp).await;
            anyhow::bail!("sudo mv nginx conf to {} exited with {}", path, s);
        }
        Err(e) => {
            let _ = fs::remove_file(&tmp).await;
            return Err(e.context("mv tmp -> nginx conf"));
        }
    }

    // Validate.
    let test = host_sudo(&["nginx", "-t"]).await;
    let valid = matches!(&test, Ok(s) if s.success());
    if !valid {
        warn!("nginx -t failed after patch — reverting");
        let reverted = match host_sudo(&["mv", &backup, path]).await {
            Ok(s) if s.success() => true,
            Ok(s) => {
                warn!("nginx conf revert from {} exited with {}", backup, s);
                false
            }
            Err(e) => {
                warn!("nginx conf revert from {} failed: {:#}", backup, e);
                false
            }
        };
        if let Err(e) = test {
            return Err(e.context("nginx -t"));
        }
        if reverted {
            anyhow::bail!("nginx config invalid after patch — reverted");
        }
        anyhow::bail!(
            "nginx config invalid after patch — revert from {} failed",
            backup
        );
    }

    // Reload nginx so the new block takes effect immediately. Reload (not
    // restart) keeps in-flight connections alive.
    match host_sudo(&["systemctl", "reload", "nginx"]).await {
        Ok(s) if s.success() => {}
        Ok(s) => warn!("nginx reload exited with {} (non-fatal)", s),
        Err(e) => warn!("nginx reload failed (non-fatal): {:#}", e),
    }
    let _ = host_sudo(&["rm", "-f", &backup]).await;
    Ok(true)
}
#[cfg(test)]
mod tests {
use super::*;
// Only asserts that the two `PodmanHealOutcome` variants compare unequal.
// The test name suggests it was kept as a marker that a former cleanup
// variant is gone (see the podman comment in `ensure_doctor_installed`).
#[test]
fn podman_heal_outcome_no_longer_has_cleanup_variant() {
let outcome = PodmanHealOutcome::Unhealthy;
assert_ne!(outcome, PodmanHealOutcome::Healthy);
}
}