349 lines
13 KiB
Rust
Raw Normal View History

refactor(container): move companion UIs to systemd via Quadlet Companion UI containers (archy-bitcoin-ui, archy-lnd-ui, archy-electrs-ui) used to be launched as fire-and-forget tokio::spawn blocks from install.rs. If archipelago crashed mid-spawn or the container's cgroup was reaped, companions vanished from podman ps -a and only a manual rm/run could bring them back (the .228 incident). Now each companion is rendered as a Quadlet .container unit under ~/.config/containers/systemd/, daemon-reloaded, and started via systemctl --user. systemd owns supervision from that point on: - archipelago can crash, restart, or be uninstalled without touching any companion. - Quadlet's Restart=always + RestartSec=10 handles container exits. - A 30s reconcile tick in boot_reconciler enumerates expected companion units and re-installs any whose unit file or service vanished — defense-in-depth against external tampering. New module layout: - container/quadlet.rs: pure unit renderer + atomic write_if_changed + systemctl helpers (daemon_reload_user / enable_now / disable_remove / is_active). 6 unit tests, no I/O in the renderer. - container/companion.rs: per-app companion specs, install/remove/ reconcile, image presence (build local first, fall back to insecure registry only via image_uses_insecure_registry whitelist). 2 tests. install.rs handle_package_install now ends with a single call to companion::install_for(package_id), replacing 287 lines of spawn-and- hope shellouts plus a ~120-line nginx auth-injector helper that worked around per-node RPC password baking. The helper is gone too — the pre-start hook renders the per-node nginx.conf to /var/lib/archipelago/ bitcoin-ui/nginx.conf and the Quadlet unit bind-mounts it read-only. runtime.rs handle_package_uninstall now disables companions before the container rm loop. Otherwise systemd's Restart=always would respawn each companion within ~10s of removal. 
Tests: 53 container tests pass, including 6 quadlet renderer tests (host network, bridge network, capability set, atomic write idempotence) and 2 companion specs (per-app companion lookup, build_unit shape). boot_reconciler tests gain a #[cfg(test)] without_companion_stage() flag so the paused-clock fixtures don't race the real systemctl I/O. A bats regression test (companion-survives-archipelago-restart.bats, gated on ARCHY_ALLOW_DESTRUCTIVE=1) asserts the .228 failure mode cannot recur: every installed companion has a unit file, services stay active across systemctl --user restart archipelago, and a deleted unit file is recreated within one reconcile tick. Net delta: +941 / -363, but the +941 is mostly tests (~440 lines) and the new declarative layer; the imperative tokio::spawn block and its nginx-auth helper are gone, removing two failure classes (orphan companions on archipelago crash, and post-start exec races under tightly-confined cgroups) that previously needed manual SSH recovery. Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
2026-05-01 10:45:07 -04:00
//! Companion UI container lifecycle, entirely Quadlet-managed.
//!
//! A "companion" is a small nginx-based container that exposes a
//! browser-friendly UI on top of a headless backend service:
//!
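//! | Backend          | Companion          | Purpose                  |
//! |------------------|--------------------|--------------------------|
//! | bitcoin          | archy-bitcoin-ui   | RPC viewer               |
//! | bitcoin-knots    | archy-bitcoin-ui   | RPC viewer               |
//! | bitcoin-core     | archy-bitcoin-ui   | RPC viewer               |
//! | lnd              | archy-lnd-ui       | wallet/channel UI        |
//! | electrumx        | archy-electrs-ui   | indexer status UI        |
//! | electrs          | archy-electrs-ui   | indexer status UI        |
//! | mempool-electrs  | archy-electrs-ui   | indexer status UI        |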
//!
//! Lifecycle: `install` writes a Quadlet `.container` unit to
//! `~/.config/containers/systemd/`, daemon-reloads, then starts the
//! generated `.service`. systemd owns supervision from that point on
//! — archipelago can crash, restart, or be uninstalled without
//! touching the companion.
//!
//! This replaces the old `tokio::spawn { podman run }` block in
//! `install.rs` (~165 lines of fire-and-forget shellouts) with a
//! single declarative call.
use anyhow::{Context, Result};
use std::path::PathBuf;
use tokio::fs;
use tokio::process::Command;
use tracing::{info, warn};
use crate::container::quadlet::{
self, BindMount, NetworkMode, QuadletUnit,
};
use archipelago_container::image_uses_insecure_registry;
/// Registry host and namespace used as the pull source for companion
/// images. `ensure_image_present` formats the full ref as
/// `<COMPANION_REGISTRY>/<image_base>:latest` and pulls it when no local
/// Dockerfile is found or the local build fails.
const COMPANION_REGISTRY: &str = "146.59.87.168:3000/lfg2025";
/// Static description of one companion. The full list per backend
/// app_id lives in `companions_for`.
///
/// Every field holds `'static` data (string literals, static slices, a
/// plain `fn` pointer), which is what lets the specs be declared as
/// `const` tables (`BITCOIN_UI`, `LND_UI`, `ELECTRS_UI`).
#[derive(Debug, Clone)]
pub struct CompanionSpec {
/// Container + unit name (e.g. "archy-bitcoin-ui"). `needs_repair`
/// derives the `<name>.container` unit file name and the
/// `<name>.service` service name from it.
pub name: &'static str,
/// Image base name in the lfg2025 registry namespace
/// (e.g. "bitcoin-ui" → "146.59.87.168:3000/lfg2025/bitcoin-ui:latest").
/// Also used for the locally built tag `localhost/<image_base>:latest`.
pub image_base: &'static str,
/// Filesystem locations to look for a local Dockerfile (build wins
/// over registry pull). Searched in order; first hit wins — if that
/// build fails, the remaining candidates are not tried and the
/// registry pull is attempted instead.
pub build_dir_candidates: &'static [&'static str],
/// Optional pre-start hook that renders config files referenced
/// by `bind_mounts`. Returns Ok(()) on success; bind-mount must
/// be present at start time or the companion will 502.
/// `install_one` awaits the hook first and aborts the install of this
/// companion if it returns an error.
pub pre_start: Option<PreStartHook>,
/// Bind mounts as `(host_path, container_path)` pairs. Always
/// read-only — companions don't write to host paths (`build_unit`
/// sets `read_only: true` on every mount).
pub bind_mounts: &'static [(&'static str, &'static str)],
}
/// Signature of a companion pre-start hook: a plain function pointer
/// taking no arguments and returning a boxed `'static` future that
/// resolves to `Result<()>`. Stored in `CompanionSpec::pre_start` and
/// awaited by `install_one` before the image is ensured and the unit is
/// written.
pub type PreStartHook = fn() -> futures_util::future::BoxFuture<'static, Result<()>>;
/// Look up the companion specs that belong to a backend `package_id`.
///
/// Returns a static slice: `BITCOIN_UI` for the Bitcoin backends,
/// `LND_UI` for `lnd`, `ELECTRS_UI` for the Electrum-style indexers,
/// and an empty slice for any app without a companion UI.
pub fn companions_for(package_id: &str) -> &'static [CompanionSpec] {
    if matches!(package_id, "bitcoin" | "bitcoin-core" | "bitcoin-knots") {
        return BITCOIN_UI;
    }
    if package_id == "lnd" {
        return LND_UI;
    }
    if matches!(package_id, "electrumx" | "electrs" | "mempool-electrs") {
        return ELECTRS_UI;
    }
    // Unknown or companion-less app.
    &[]
}
/// Companion set returned by `companions_for` for the Bitcoin backends
/// ("bitcoin", "bitcoin-core", "bitcoin-knots"): a single
/// `archy-bitcoin-ui` container.
const BITCOIN_UI: &[CompanionSpec] = &[CompanionSpec {
name: "archy-bitcoin-ui",
image_base: "bitcoin-ui",
// Candidate build contexts, probed in order for a `Dockerfile` by
// `ensure_image_present`.
build_dir_candidates: &[
"/opt/archipelago/docker/bitcoin-ui",
"/home/archipelago/archy/docker/bitcoin-ui",
"/home/archipelago/Projects/archy/docker/bitcoin-ui",
],
// The only companion with a pre-start hook: the nginx config is rendered
// on the host before the container starts.
pre_start: Some(render_bitcoin_ui),
// (host path, container path). NOTE(review): the host path is presumably
// where `render_bitcoin_ui` writes via `RenderPaths::default()` — confirm
// against `container::bitcoin_ui`.
bind_mounts: &[(
"/var/lib/archipelago/bitcoin-ui/nginx.conf",
"/etc/nginx/conf.d/default.conf",
)],
}];
/// Companion set returned by `companions_for` for "lnd": a single
/// `archy-lnd-ui` container with no pre-start hook and no bind mounts.
const LND_UI: &[CompanionSpec] = &[CompanionSpec {
name: "archy-lnd-ui",
image_base: "lnd-ui",
// Candidate build contexts, probed in order for a `Dockerfile` by
// `ensure_image_present`.
build_dir_candidates: &[
"/opt/archipelago/docker/lnd-ui",
"/home/archipelago/archy/docker/lnd-ui",
"/home/archipelago/Projects/archy/docker/lnd-ui",
],
pre_start: None,
bind_mounts: &[],
}];
/// Companion set returned by `companions_for` for the indexer backends
/// ("electrumx", "electrs", "mempool-electrs"): a single
/// `archy-electrs-ui` container with no pre-start hook and no bind mounts.
const ELECTRS_UI: &[CompanionSpec] = &[CompanionSpec {
name: "archy-electrs-ui",
image_base: "electrs-ui",
// Candidate build contexts, probed in order for a `Dockerfile` by
// `ensure_image_present`.
build_dir_candidates: &[
"/opt/archipelago/docker/electrs-ui",
"/home/archipelago/archy/docker/electrs-ui",
"/home/archipelago/Projects/archy/docker/electrs-ui",
],
pre_start: None,
bind_mounts: &[],
}];
fn render_bitcoin_ui() -> futures_util::future::BoxFuture<'static, Result<()>> {
Box::pin(async {
let paths = crate::container::bitcoin_ui::RenderPaths::default();
crate::container::bitcoin_ui::render(&paths)
.await
.map(|_| ())
.context("render bitcoin-ui nginx.conf")
})
}
/// Install and start all companions that belong to `package_id`.
///
/// Companions are handled one after another and independently: when
/// `install_one` fails for a companion the error is logged, recorded,
/// and the loop moves on to the next one.
///
/// Returns the `(companion name, error)` pairs for every companion that
/// could not be installed; the vector is empty on full success or when
/// the app has no companions.
pub async fn install_for(package_id: &str) -> Vec<(String, anyhow::Error)> {
    let mut failures = Vec::new();
    for spec in companions_for(package_id) {
        match install_one(spec).await {
            Ok(()) => {}
            Err(e) => {
                warn!(companion = spec.name, error = %e, "companion install failed");
                failures.push((spec.name.to_owned(), e));
            }
        }
    }
    failures
}
/// Disable and remove all companions that belong to `package_id`.
///
/// Best effort throughout: if the Quadlet unit directory cannot be
/// resolved nothing is removed and a warning is logged; a failure to
/// remove one companion is logged and the remaining ones are still
/// processed. Nothing is reported back to the caller.
pub async fn remove_for(package_id: &str) {
    let unit_dir = match quadlet::unit_dir().await {
        Err(e) => {
            warn!("companion remove: cannot resolve quadlet dir: {e:#}");
            return;
        }
        Ok(path) => path,
    };

    for spec in companions_for(package_id) {
        let outcome = quadlet::disable_remove(spec.name, &unit_dir).await;
        if let Err(e) = outcome {
            warn!(companion = spec.name, error = %e, "companion remove failed");
        }
    }
}
/// Provision one companion: pre-start hook → image present → write
/// quadlet → daemon-reload → start.
///
/// Steps, in order:
/// 1. Await the spec's `pre_start` hook, if it has one.
/// 2. Make sure the image exists (`ensure_image_present`).
/// 3. Render the unit (`build_unit`) and write it to the Quadlet unit
///    directory; the file is only rewritten when its content changed.
/// 4. Reload the user systemd manager — unconditionally, see below.
/// 5. Enable and start the generated service.
///
/// # Errors
///
/// Returns the first error from any step. The function is meant to be
/// safe to re-run: the reconciler calls it again for any companion whose
/// unit file is missing or whose service is not active.
pub async fn install_one(spec: &CompanionSpec) -> Result<()> {
    if let Some(hook) = spec.pre_start {
        hook().await.with_context(|| {
            format!("pre-start hook failed for {} — companion will not start", spec.name)
        })?;
    }

    let image = ensure_image_present(spec).await?;
    let unit = build_unit(spec, &image);
    let dir = quadlet::unit_dir().await?;

    if quadlet::write_if_changed(&unit, &dir).await? {
        info!(companion = spec.name, "wrote quadlet unit");
    }

    // Reload even when the unit file was already up to date. The service is
    // generated from the `.container` file during a daemon-reload, so if an
    // earlier attempt wrote the file but never completed the reload (reload
    // error, crash in between), gating the reload on "file changed" would
    // skip it on every retry and the start below could never succeed.
    quadlet::daemon_reload_user().await?;

    // Start is idempotent — if already running, systemctl returns 0.
    quadlet::enable_now(&unit.service_name()).await?;
    info!(companion = spec.name, "companion started");
    Ok(())
}
/// Build companion image locally if a Dockerfile exists, otherwise
/// pull from the lfg2025 registry. Returns the image ref the quadlet
/// should reference (`localhost/<base>:latest` for build, registry
/// URL for pull).
///
/// The build directories in `spec.build_dir_candidates` are probed in
/// order. The first one that contains a `Dockerfile` is built with
/// `podman build --no-cache`; if that build fails, the remaining
/// candidates are not tried and the registry pull is attempted instead.
///
/// # Errors
///
/// Fails if `podman` cannot be spawned, or if the registry pull fails.
/// When a local build was attempted and also failed, the returned error
/// carries both the build and the pull stderr so the root cause is not
/// hidden behind a "no local Dockerfile" message.
async fn ensure_image_present(spec: &CompanionSpec) -> Result<String> {
    let local_image = format!("localhost/{}:latest", spec.image_base);
    let registry_image = format!("{}/{}:latest", COMPANION_REGISTRY, spec.image_base);

    // Filled in when a Dockerfile was found but the build failed, so that a
    // subsequent pull failure can report what actually happened.
    let mut build_failure: Option<String> = None;

    // Prefer local build — companions can carry build-time customizations
    // (e.g. nginx.conf templates baked in). Search known candidates.
    for dir in spec.build_dir_candidates {
        let dockerfile = PathBuf::from(dir).join("Dockerfile");
        // A probe error is treated the same as "not there".
        if !fs::try_exists(&dockerfile).await.unwrap_or(false) {
            continue;
        }
        info!(companion = spec.name, "building locally from {dir}");
        let out = Command::new("podman")
            .args(["build", "--no-cache", "-t", &local_image, dir])
            .output()
            .await
            .context("spawn podman build")?;
        if out.status.success() {
            return Ok(local_image);
        }
        let build_stderr = String::from_utf8_lossy(&out.stderr).trim().to_string();
        warn!(companion = spec.name, "local build failed: {}", build_stderr);
        build_failure = Some(format!("{dir}: {build_stderr}"));
        // Fall through to registry pull rather than fail outright.
        break;
    }

    // Registry pull. Use insecure flag only for whitelisted hosts.
    let mut cmd = Command::new("podman");
    cmd.arg("pull");
    if image_uses_insecure_registry(&registry_image) {
        cmd.arg("--tls-verify=false");
    }
    cmd.arg(&registry_image);
    let out = cmd.output().await.context("spawn podman pull")?;
    if out.status.success() {
        return Ok(registry_image);
    }

    let pull_stderr = String::from_utf8_lossy(&out.stderr);
    let pull_stderr = pull_stderr.trim();
    match build_failure {
        Some(build_stderr) => anyhow::bail!(
            "local build failed ({}) and registry pull failed for {}: {}",
            build_stderr,
            spec.name,
            pull_stderr
        ),
        None => anyhow::bail!(
            "no local Dockerfile and registry pull failed for {}: {}",
            spec.name,
            pull_stderr
        ),
    }
}
/// Translate a `CompanionSpec` plus a resolved image ref into the
/// `QuadletUnit` that gets rendered to disk.
///
/// Pure function: no I/O, the result depends only on `spec` and `image`.
/// All companions share the same runtime profile (host network, root
/// inside the container, 128 MB memory limit, all capabilities dropped
/// except a short allow-list); only name, image and bind mounts vary.
fn build_unit(spec: &CompanionSpec, image: &str) -> QuadletUnit {
    // Spec mounts are (host, container) pairs and are always read-only.
    let bind_mounts = spec
        .bind_mounts
        .iter()
        .map(|&(host_path, container_path)| BindMount {
            host: PathBuf::from(host_path),
            container: PathBuf::from(container_path),
            read_only: true,
        })
        .collect();

    QuadletUnit {
        name: spec.name.into(),
        description: format!("Archipelago companion UI: {}", spec.name),
        image: image.into(),
        // Companions proxy to localhost — backend is on :5678, bitcoin
        // RPC on :8332. Host network is the simplest way to reach them
        // without per-app gateway plumbing.
        network: NetworkMode::Host,
        // Run as root inside the container so nginx can chown its
        // worker dirs. Rootless podman maps this to a high host UID,
        // so it is unprivileged on the host.
        user: Some("0:0".into()),
        memory_mb: Some(128),
        cap_drop_all: true,
        cap_add: vec![
            "CHOWN".into(),
            "DAC_OVERRIDE".into(),
            "NET_BIND_SERVICE".into(),
            "SETUID".into(),
            "SETGID".into(),
        ],
        bind_mounts,
        extra_podman_args: Vec::new(),
        depends_on: Vec::new(),
    }
}
/// Heuristic for "can we talk to a user systemd manager?".
///
/// Returns `true` only when the `XDG_RUNTIME_DIR` environment variable
/// is set to a non-empty value. In production archipelago.service
/// inherits it from systemd; in unit tests / CI sandboxes it is unset,
/// where `systemctl --user` would fail and writing under HOME would be
/// an unwanted side effect. The reconciler skips its companion stage
/// when this returns `false`.
fn user_systemd_available() -> bool {
    match std::env::var_os("XDG_RUNTIME_DIR") {
        Some(runtime_dir) => !runtime_dir.is_empty(),
        None => false,
    }
}
/// Bring companions back in line with the set of installed apps.
///
/// For every companion expected by `installed_apps`, probe it with
/// `needs_repair`; anything whose unit file is missing or whose service
/// is not active is re-provisioned through `install_one`.
///
/// Returns `(companion, error)` for each companion whose probe failed or
/// whose repair attempt failed. Healthy companions and successful
/// repairs produce no entry.
///
/// Called from `boot_reconciler` so a deleted unit file or a stopped
/// service is repaired within one tick. Returns an empty list without
/// doing anything when no user systemd manager is reachable (CI / test
/// environments).
pub async fn reconcile(installed_apps: &[String]) -> Vec<(String, anyhow::Error)> {
    let mut failures = Vec::new();
    if !user_systemd_available() {
        return failures;
    }

    let expected = installed_apps
        .iter()
        .flat_map(|app_id| companions_for(app_id).iter());

    for spec in expected {
        let repair_needed = match needs_repair(spec).await {
            Ok(flag) => flag,
            Err(e) => {
                warn!(companion = spec.name, error = %e, "reconcile probe failed");
                failures.push((spec.name.to_string(), e));
                continue;
            }
        };
        if !repair_needed {
            continue;
        }

        info!(companion = spec.name, "reconcile: companion not active, repairing");
        if let Err(e) = install_one(spec).await {
            failures.push((spec.name.to_string(), e));
        }
    }
    failures
}
/// Decide whether `install_one` has to be re-run for this companion.
///
/// Yields `Ok(true)` when the `<name>.container` unit file is absent
/// from the Quadlet unit directory (an error while probing the file
/// counts as absent), or when the file exists but `<name>.service` is
/// not active. Yields `Ok(false)` otherwise.
///
/// # Errors
///
/// Fails only if the Quadlet unit directory cannot be resolved.
async fn needs_repair(spec: &CompanionSpec) -> Result<bool> {
    let unit_file = quadlet::unit_dir()
        .await?
        .join(format!("{}.container", spec.name));

    let unit_present = fs::try_exists(&unit_file).await.unwrap_or(false);
    if !unit_present {
        return Ok(true);
    }

    let service = format!("{}.service", spec.name);
    let active = quadlet::is_active(&service).await;
    Ok(!active)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every known backend id maps to exactly one companion; ids without
    /// a companion UI map to an empty set.
    #[test]
    fn companions_for_known_apps_returns_expected_set() {
        let expectations = [
            ("bitcoin-knots", 1),
            ("bitcoin-core", 1),
            ("bitcoin", 1),
            ("lnd", 1),
            ("electrumx", 1),
            ("electrs", 1),
            ("mempool-electrs", 1),
            ("nextcloud", 0),
            ("not-a-real-app", 0),
        ];
        for (package_id, expected_len) in expectations.iter() {
            assert_eq!(
                companions_for(package_id).len(),
                *expected_len,
                "package_id = {}",
                package_id
            );
        }
    }

    /// The rendered unit for the bitcoin UI carries the shared runtime
    /// profile and its single read-only nginx.conf bind mount.
    #[test]
    fn build_unit_uses_host_network_and_drops_caps() {
        let unit = build_unit(&BITCOIN_UI[0], "localhost/bitcoin-ui:latest");

        assert_eq!(unit.name, "archy-bitcoin-ui");
        assert!(matches!(unit.network, NetworkMode::Host));
        assert!(unit.cap_drop_all);
        assert!(unit.cap_add.iter().any(|cap| cap == "NET_BIND_SERVICE"));
        assert_eq!(unit.user.as_deref(), Some("0:0"));
        assert_eq!(unit.memory_mb, Some(128));

        assert_eq!(unit.bind_mounts.len(), 1);
        let mount = &unit.bind_mounts[0];
        assert_eq!(
            mount.container,
            PathBuf::from("/etc/nginx/conf.d/default.conf")
        );
        assert!(mount.read_only);
    }
}