archipelago f9e34fd0c6 refactor(install): route orchestrator-managed apps through orchestrator first
Phase 3a of the install path consolidation. Two coupled changes:

1. install.rs handle_package_install: gate the legacy "container exists →
   adopt + return" probe on !orchestrator_managed. Apps the orchestrator
   knows about (bitcoin-knots, bitcoin-core, lnd, electrumx, fedimint,
   filebrowser, btcpay-server stack apps, mempool stack apps, plus the
   companion UIs that just moved to Quadlet) skip the legacy probe and
   fall straight into the orchestrator branch.

   The legacy adopt block was returning success on a bare `podman start`
   exit-0 — even when the process inside the container crashed seconds
   later. That's the .228 "running but unreachable" failure mode. The
   orchestrator's ensure_running honors the manifest's health check and
   pre-start hooks (e.g. re-renders bitcoin-ui's nginx.conf if the RPC
   password rotated), so this is a behavioral upgrade, not just a
   refactor.

2. ProdContainerOrchestrator::install: make idempotent. Previously it
   blindly called install_fresh which would fail on `podman create` if
   the container name already existed. Now it delegates to ensure_running:
     - Container Running + healthy → no-op (refresh hooks, restart if
       config rewritten)
     - Container Stopped/Exited → start (with hook refresh)
     - Container missing → install_fresh
     - Container in wedged state (Created/Paused/Unknown) → force-recreate

   Without this, change #1 would regress every "container already exists"
   case for the 18 orchestrator-managed app IDs. With it, install becomes
   the single source of truth for "make app X be in the desired state."

Tests: 654 passed across the workspace (614 unit + 37 orchestration + 3
rpc), 0 failures. The 20 prod_orchestrator tests cover the install /
ensure_running / reconcile paths the new install delegates through.

Net delta: install.rs grows by ~30 lines (gating wrapper + comments),
prod_orchestrator.rs grows by ~30 lines (idempotent install body). Both
are temporary — the larger deletions (~1700 lines) come once every app
has been verified through the orchestrator path in subsequent phases.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 11:12:52 -04:00

353 lines
13 KiB
Rust

//! Companion UI container lifecycle, entirely Quadlet-managed.
//!
//! A "companion" is a small nginx-based container that exposes a
//! browser-friendly UI on top of a headless backend service:
//!
//! | Backend | Companion | Purpose |
//! |------------------|--------------------|--------------------------|
//! | bitcoin-knots | archy-bitcoin-ui | RPC viewer |
//! | bitcoin-core | archy-bitcoin-ui | RPC viewer |
//! | lnd | archy-lnd-ui | wallet/channel UI |
//! | electrumx | archy-electrs-ui | indexer status UI |
//!
//! Lifecycle: `install_one` writes a Quadlet `.container` unit to
//! `~/.config/containers/systemd/`, daemon-reloads if the unit changed,
//! then starts the generated `.service`. systemd owns supervision from that point on
//! — archipelago can crash, restart, or be uninstalled without
//! touching the companion.
//!
//! This replaces the old `tokio::spawn { podman run }` block in
//! `install.rs` (~165 lines of fire-and-forget shellouts) with a
//! single declarative call.
use anyhow::{Context, Result};
use std::path::PathBuf;
use tokio::fs;
use tokio::process::Command;
use tracing::{info, warn};
use crate::container::quadlet::{self, BindMount, NetworkMode, QuadletUnit};
use archipelago_container::image_uses_insecure_registry;
/// Registry host and namespace that companion images are pulled from
/// when no local build succeeds. `ensure_image_present` appends
/// `/<image_base>:latest` to form the full image reference.
const COMPANION_REGISTRY: &str = "146.59.87.168:3000/lfg2025";
/// Static description of one companion UI container.
///
/// Instances live in the per-backend `const` tables (`BITCOIN_UI`,
/// `LND_UI`, `ELECTRS_UI`); `companions_for` maps a backend app id to
/// the table that applies to it.
#[derive(Debug, Clone)]
pub struct CompanionSpec {
/// Container + unit name (e.g. "archy-bitcoin-ui"). The quadlet unit
/// file is `<name>.container` and the probed service is
/// `<name>.service`.
pub name: &'static str,
/// Image base name in the lfg2025 registry namespace
/// (e.g. "bitcoin-ui" → "146.59.87.168:3000/lfg2025/bitcoin-ui:latest").
/// A successful local build is tagged `localhost/<image_base>:latest`
/// instead.
pub image_base: &'static str,
/// Filesystem locations to look for a local Dockerfile (build wins
/// over registry pull). Searched in order; first hit wins. If the
/// build in that first hit fails, the remaining candidates are not
/// tried and the registry pull is attempted instead.
pub build_dir_candidates: &'static [&'static str],
/// Optional pre-start hook that renders config files referenced
/// by `bind_mounts`. Returns Ok(()) on success; bind-mount must
/// be present at start time or the companion will 502.
/// `install_one` runs the hook first and aborts the install when it
/// returns an error.
pub pre_start: Option<PreStartHook>,
/// Bind mounts as `(host path, container path)` pairs. Always
/// read-only — companions don't write to host paths.
pub bind_mounts: &'static [(&'static str, &'static str)],
}
/// Signature of a companion pre-start hook: a plain function pointer
/// (so it can be stored in the `const` spec tables) that returns a boxed
/// future resolving to `Ok(())` when the hook succeeded.
pub type PreStartHook = fn() -> futures_util::future::BoxFuture<'static, Result<()>>;
/// Look up the companion UIs that belong to the backend `package_id`.
///
/// Several backend ids share one companion table (all bitcoin flavours
/// map to the bitcoin UI, all electrum-style indexers to the electrs
/// UI). Ids without a companion yield an empty slice.
pub fn companions_for(package_id: &str) -> &'static [CompanionSpec] {
    if matches!(package_id, "bitcoin" | "bitcoin-core" | "bitcoin-knots") {
        return BITCOIN_UI;
    }
    if matches!(package_id, "lnd") {
        return LND_UI;
    }
    if matches!(package_id, "electrumx" | "electrs" | "mempool-electrs") {
        return ELECTRS_UI;
    }
    // Unknown or companion-less app.
    &[]
}
/// Companion table for the bitcoin backends ("bitcoin", "bitcoin-core",
/// "bitcoin-knots" in `companions_for`). The single entry runs
/// `render_bitcoin_ui` before start and bind-mounts the host-side
/// `nginx.conf` over the container's `/etc/nginx/conf.d/default.conf`.
const BITCOIN_UI: &[CompanionSpec] = &[CompanionSpec {
name: "archy-bitcoin-ui",
image_base: "bitcoin-ui",
// Searched in order for a `Dockerfile`; first hit is built locally.
build_dir_candidates: &[
"/opt/archipelago/docker/bitcoin-ui",
"/home/archipelago/archy/docker/bitcoin-ui",
"/home/archipelago/Projects/archy/docker/bitcoin-ui",
],
pre_start: Some(render_bitcoin_ui),
// (host path, container path)
bind_mounts: &[(
"/var/lib/archipelago/bitcoin-ui/nginx.conf",
"/etc/nginx/conf.d/default.conf",
)],
}];
/// Companion table for the "lnd" backend. The single entry has no
/// pre-start hook and no bind mounts.
const LND_UI: &[CompanionSpec] = &[CompanionSpec {
name: "archy-lnd-ui",
image_base: "lnd-ui",
// Searched in order for a `Dockerfile`; first hit is built locally.
build_dir_candidates: &[
"/opt/archipelago/docker/lnd-ui",
"/home/archipelago/archy/docker/lnd-ui",
"/home/archipelago/Projects/archy/docker/lnd-ui",
],
pre_start: None,
bind_mounts: &[],
}];
/// Companion table for the electrum-style backends ("electrumx",
/// "electrs", "mempool-electrs" in `companions_for`). The single entry
/// has no pre-start hook and no bind mounts.
const ELECTRS_UI: &[CompanionSpec] = &[CompanionSpec {
name: "archy-electrs-ui",
image_base: "electrs-ui",
// Searched in order for a `Dockerfile`; first hit is built locally.
build_dir_candidates: &[
"/opt/archipelago/docker/electrs-ui",
"/home/archipelago/archy/docker/electrs-ui",
"/home/archipelago/Projects/archy/docker/electrs-ui",
],
pre_start: None,
bind_mounts: &[],
}];
/// Pre-start hook for `archy-bitcoin-ui`.
///
/// Calls `bitcoin_ui::render` with the default `RenderPaths` and
/// discards its success value; a failure is wrapped with the context
/// "render bitcoin-ui nginx.conf". Boxed so it matches `PreStartHook`.
fn render_bitcoin_ui() -> futures_util::future::BoxFuture<'static, Result<()>> {
    use crate::container::bitcoin_ui::{render, RenderPaths};

    let task = async {
        let render_paths = RenderPaths::default();
        let rendered = render(&render_paths).await;
        // Only success/failure matters to the caller; drop the payload.
        rendered.context("render bitcoin-ui nginx.conf").map(drop)
    };
    Box::pin(task)
}
/// Provision and start every companion for `package_id`. Each
/// companion is independent — a failure in one is logged but does
/// not abort the others.
pub async fn install_for(package_id: &str) -> Vec<(String, anyhow::Error)> {
let mut failures = Vec::new();
for spec in companions_for(package_id) {
if let Err(e) = install_one(spec).await {
warn!(companion = spec.name, error = %e, "companion install failed");
failures.push((spec.name.to_string(), e));
}
}
failures
}
/// Stop and remove every companion for `package_id`. Best effort:
/// errors are logged but do not abort the sequence.
pub async fn remove_for(package_id: &str) {
let dir = match quadlet::unit_dir().await {
Ok(d) => d,
Err(e) => {
warn!("companion remove: cannot resolve quadlet dir: {e:#}");
return;
}
};
for spec in companions_for(package_id) {
if let Err(e) = quadlet::disable_remove(spec.name, &dir).await {
warn!(companion = spec.name, error = %e, "companion remove failed");
}
}
}
/// Bring a single companion into the installed-and-started state.
///
/// Steps, in order:
/// 1. run the spec's pre-start hook (if any) so bind-mounted config exists;
/// 2. make sure the image is available (local build or registry pull);
/// 3. write the quadlet unit, reloading the user systemd manager only
///    when the file content actually changed;
/// 4. enable and start the generated service.
///
/// # Errors
/// The first failing step aborts the install and its error is returned.
pub async fn install_one(spec: &CompanionSpec) -> Result<()> {
    if let Some(render_config) = spec.pre_start {
        let hook_result = render_config().await;
        hook_result.with_context(|| {
            format!(
                "pre-start hook failed for {} — companion will not start",
                spec.name
            )
        })?;
    }

    let image_ref = ensure_image_present(spec).await?;
    let unit = build_unit(spec, &image_ref);
    let unit_dir = quadlet::unit_dir().await?;

    if quadlet::write_if_changed(&unit, &unit_dir).await? {
        info!(companion = spec.name, "wrote quadlet unit");
        quadlet::daemon_reload_user().await?;
    }

    // Safe to call on every install: starting an already-running service
    // is treated as success by systemctl.
    quadlet::enable_now(&unit.service_name()).await?;
    info!(companion = spec.name, "companion started");
    Ok(())
}
/// Make sure the companion's image exists locally and return the image
/// reference the quadlet unit should use.
///
/// Strategy:
/// 1. Walk `spec.build_dir_candidates` in order. The first directory
///    that contains a `Dockerfile` is built with `podman build
///    --no-cache` and tagged `localhost/<image_base>:latest`; on success
///    that tag is returned.
/// 2. If no candidate has a Dockerfile, or the build failed, pull
///    `<COMPANION_REGISTRY>/<image_base>:latest` and return that
///    reference. `--tls-verify=false` is passed only when
///    `image_uses_insecure_registry` approves the image.
///
/// # Errors
/// Fails if `podman` cannot be spawned, or if the registry pull fails.
/// When a local build was attempted and also failed, the error reports
/// both the build failure and the pull failure instead of claiming that
/// no Dockerfile was found.
async fn ensure_image_present(spec: &CompanionSpec) -> Result<String> {
    let local_image = format!("localhost/{}:latest", spec.image_base);
    let registry_image = format!("{}/{}:latest", COMPANION_REGISTRY, spec.image_base);

    // Set when a Dockerfile was found but its build failed, so a later
    // pull failure can surface the real root cause.
    let mut local_build_failure: Option<String> = None;

    // Prefer local build — companions can carry build-time customizations
    // (e.g. nginx.conf templates baked in). Search known candidates.
    for dir in spec.build_dir_candidates {
        let dockerfile = PathBuf::from(dir).join("Dockerfile");
        if !fs::try_exists(&dockerfile).await.unwrap_or(false) {
            continue;
        }
        info!(companion = spec.name, "building locally from {dir}");
        let out = Command::new("podman")
            .args(["build", "--no-cache", "-t", &local_image, dir])
            .output()
            .await
            .context("spawn podman build")?;
        if out.status.success() {
            return Ok(local_image);
        }
        let build_stderr = String::from_utf8_lossy(&out.stderr).trim().to_string();
        warn!(companion = spec.name, "local build failed: {}", build_stderr);
        local_build_failure = Some(format!("{dir}: {build_stderr}"));
        // First hit wins: fall through to registry pull rather than fail
        // outright or try further candidates.
        break;
    }

    // Registry pull. Use insecure flag only for whitelisted hosts.
    let mut cmd = Command::new("podman");
    cmd.arg("pull");
    if image_uses_insecure_registry(&registry_image) {
        cmd.arg("--tls-verify=false");
    }
    cmd.arg(&registry_image);
    let out = cmd.output().await.context("spawn podman pull")?;
    if !out.status.success() {
        let pull_stderr = String::from_utf8_lossy(&out.stderr);
        let pull_stderr = pull_stderr.trim();
        match local_build_failure {
            Some(build_failure) => anyhow::bail!(
                "local build failed for {} ({}) and registry pull failed: {}",
                spec.name,
                build_failure,
                pull_stderr
            ),
            None => anyhow::bail!(
                "no local Dockerfile and registry pull failed for {}: {}",
                spec.name,
                pull_stderr
            ),
        }
    }
    Ok(registry_image)
}
/// Translate a companion spec plus a resolved image reference into the
/// `QuadletUnit` that gets written to disk.
///
/// All companions share the same runtime profile: host networking,
/// container-root user, 128 MB memory limit, all capabilities dropped
/// except a small add-back list, and every bind mount read-only.
fn build_unit(spec: &CompanionSpec, image: &str) -> QuadletUnit {
    // Every (host, container) pair from the spec becomes a read-only mount.
    let mounts = spec
        .bind_mounts
        .iter()
        .map(|(host_path, container_path)| BindMount {
            host: PathBuf::from(*host_path),
            container: PathBuf::from(*container_path),
            read_only: true,
        })
        .collect();

    // Capabilities re-granted after the blanket drop.
    let granted_caps = [
        "CHOWN",
        "DAC_OVERRIDE",
        "NET_BIND_SERVICE",
        "SETUID",
        "SETGID",
    ]
    .iter()
    .map(|cap| (*cap).into())
    .collect();

    QuadletUnit {
        name: spec.name.into(),
        description: format!("Archipelago companion UI: {}", spec.name),
        image: image.into(),
        // The companions proxy to services on localhost (backend on
        // :5678, bitcoin RPC on :8332); host networking reaches them
        // without any per-app gateway plumbing.
        network: NetworkMode::Host,
        // Root inside the container lets nginx chown its worker dirs.
        // Under rootless podman that maps to a high, unprivileged host
        // UID.
        user: Some("0:0".into()),
        memory_mb: Some(128),
        cap_drop_all: true,
        cap_add: granted_caps,
        bind_mounts: mounts,
        extra_podman_args: vec![],
        depends_on: vec![],
    }
}
/// Report whether a user systemd manager looks reachable.
///
/// The check is simply "is `XDG_RUNTIME_DIR` set to a non-empty value".
/// In production archipelago.service inherits the variable from
/// systemd; in unit tests / CI sandboxes it is unset, where
/// `systemctl --user` would fail and writing under HOME would be an
/// unwanted side effect. The reconciler skips its companion stage when
/// this returns false.
fn user_systemd_available() -> bool {
    match std::env::var_os("XDG_RUNTIME_DIR") {
        Some(runtime_dir) => !runtime_dir.is_empty(),
        None => false,
    }
}
/// Reconcile companion presence: every expected companion for the
/// given installed apps must have its quadlet unit on disk and its
/// service active. Returns a list of (companion, error) for anything
/// that needed correction and failed.
///
/// Called from `boot_reconciler` so a deleted unit file or a stopped
/// service is repaired within one tick. No-ops if the user systemd
/// manager is not reachable (CI / test environments).
pub async fn reconcile(installed_apps: &[String]) -> Vec<(String, anyhow::Error)> {
if !user_systemd_available() {
return Vec::new();
}
let mut failures = Vec::new();
for app_id in installed_apps {
for spec in companions_for(app_id) {
match needs_repair(spec).await {
Ok(false) => {}
Ok(true) => {
info!(
companion = spec.name,
"reconcile: companion not active, repairing"
);
if let Err(e) = install_one(spec).await {
failures.push((spec.name.to_string(), e));
}
}
Err(e) => {
warn!(companion = spec.name, error = %e, "reconcile probe failed");
failures.push((spec.name.to_string(), e));
}
}
}
}
failures
}
/// Decide whether `install_one` has to be re-run for this companion.
///
/// Returns `Ok(true)` when the `<name>.container` unit file is absent
/// (an unreadable path counts as absent) or when `<name>.service` is not
/// active; `Ok(false)` when both are in place.
///
/// # Errors
/// Fails only if the quadlet unit directory cannot be resolved.
async fn needs_repair(spec: &CompanionSpec) -> Result<bool> {
    let unit_dir = quadlet::unit_dir().await?;
    let unit_file = unit_dir.join(format!("{}.container", spec.name));

    let unit_on_disk = fs::try_exists(&unit_file).await.unwrap_or(false);
    if unit_on_disk {
        let service = format!("{}.service", spec.name);
        let active = quadlet::is_active(&service).await;
        Ok(!active)
    } else {
        // No unit file: the service cannot be managed, so repair.
        Ok(true)
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every known backend alias resolves to exactly one companion and
    /// unrelated app ids resolve to none.
    #[test]
    fn companions_for_known_apps_returns_expected_set() {
        let expectations = [
            ("bitcoin-knots", 1),
            ("bitcoin-core", 1),
            ("bitcoin", 1),
            ("lnd", 1),
            ("electrumx", 1),
            ("electrs", 1),
            ("mempool-electrs", 1),
            ("nextcloud", 0),
            ("not-a-real-app", 0),
        ];
        for (app_id, expected) in expectations.iter() {
            assert_eq!(
                companions_for(app_id).len(),
                *expected,
                "unexpected companion count for {app_id}"
            );
        }
    }

    /// The generated unit carries the shared runtime profile and the
    /// spec's single read-only nginx.conf mount.
    #[test]
    fn build_unit_uses_host_network_and_drops_caps() {
        let unit = build_unit(&BITCOIN_UI[0], "localhost/bitcoin-ui:latest");

        assert_eq!(unit.name, "archy-bitcoin-ui");
        assert!(matches!(unit.network, NetworkMode::Host));
        assert!(unit.cap_drop_all);
        assert!(unit.cap_add.iter().any(|cap| cap == "NET_BIND_SERVICE"));
        assert_eq!(unit.user.as_deref(), Some("0:0"));
        assert_eq!(unit.memory_mb, Some(128));

        assert_eq!(unit.bind_mounts.len(), 1);
        let nginx_conf = &unit.bind_mounts[0];
        assert_eq!(
            nginx_conf.container,
            PathBuf::from("/etc/nginx/conf.d/default.conf")
        );
        assert!(nginx_conf.read_only);
    }
}