archipelago 03a4ee1b30 feat(container): manifest-declared generated secrets + companion/quadlet hardening
Generated-secrets system: apps declare `generated_secrets` in their manifest
(kinds hex16/hex32/bcrypt); `container::secrets::ensure_generated_secrets`
materialises them 0600/rootless in resolve_dynamic_env — idempotent and
self-healing (recovers wrongly root-owned secrets with no privilege). Replaces
per-app Rust (deletes ensure_fmcd_password). fedimint-clientd/gateway manifests
now declare fmcd-password / fedimint-gateway-hash.

companion.rs: rebuild the auto-built :latest image when its build context changes
(staleness check) so baked-in fixes (e.g. guardian-UI CSS) actually reach nodes.

quadlet.rs: skip PublishPort under Network=host (podman rejects the combo, exit
125) + regression tests.

UI: "Fedimint Guardian" rename, fedimint-clientd/nostr-rs-relay/meshtastic tagged
as Services (headless backends), gateway icon fallback.

Deployed + verified on .228 (generated-secrets fixed fedimint-gateway start;
grafana/strfry orphan crash-loop units removed).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
2026-06-21 05:11:07 -04:00

549 lines
21 KiB
Rust

//! Companion UI container lifecycle, entirely Quadlet-managed.
//!
//! A "companion" is a small nginx-based container that exposes a
//! browser-friendly UI on top of a headless backend service:
//!
//! | Backend | Companion | Purpose |
//! |------------------|--------------------|--------------------------|
//! | bitcoin-knots | archy-bitcoin-ui | RPC viewer |
//! | bitcoin-core | archy-bitcoin-ui | RPC viewer |
//! | lnd | archy-lnd-ui | wallet/channel UI |
//! | electrumx | archy-electrs-ui | indexer status UI |
//! | fedimint | archy-fedimint-ui | wait/proxy Guardian UI |
//!
//! Lifecycle: `install` writes a Quadlet `.container` unit to
//! `~/.config/containers/systemd/`, daemon-reloads, then starts the
//! generated `.service`. systemd owns supervision from that point on
//! — archipelago can crash, restart, or be uninstalled without
//! touching the companion.
//!
//! This replaces the old `tokio::spawn { podman run }` block in
//! `install.rs` (~165 lines of fire-and-forget shellouts) with a
//! single declarative call.
use anyhow::{Context, Result};
use std::path::PathBuf;
use std::time::Duration;
use tokio::fs;
use tokio::process::Command;
use tracing::{info, warn};
use crate::container::quadlet::{self, BindMount, NetworkMode, QuadletUnit};
use archipelago_container::image_uses_insecure_registry;
/// Registry host and namespace prefix that companion images are pulled from
/// when no usable local build exists (`<registry>/<image_base>:latest`).
const COMPANION_REGISTRY: &str = "146.59.87.168:3000/lfg2025";
/// Upper bound for the quick `podman image inspect` probes (existence check
/// and creation-time lookup).
const COMPANION_IMAGE_CHECK_TIMEOUT: Duration = Duration::from_secs(15);
/// Upper bound for a local `podman build` of a companion image.
const COMPANION_BUILD_TIMEOUT: Duration = Duration::from_secs(900);
/// Upper bound for a `podman pull` of a companion image from the registry.
const COMPANION_PULL_TIMEOUT: Duration = Duration::from_secs(300);
/// Static description of one companion UI container. The full list per
/// backend app_id lives in `companions_for`.
#[derive(Debug, Clone)]
pub struct CompanionSpec {
/// Container + unit name (e.g. "archy-bitcoin-ui"). The Quadlet unit file
/// is `<name>.container` and the service probed for liveness is
/// `<name>.service`.
pub name: &'static str,
/// Image base name. Expands to `localhost/<base>:latest` (auto-built) or
/// `localhost/<base>:local` (manual override) for local images, and to the
/// lfg2025 registry namespace for pulls
/// (e.g. "bitcoin-ui" → "146.59.87.168:3000/lfg2025/bitcoin-ui:latest").
pub image_base: &'static str,
/// Filesystem locations to look for a local Dockerfile (build wins
/// over registry pull). Searched in order; first hit wins.
pub build_dir_candidates: &'static [&'static str],
/// Optional pre-start hook that renders config files referenced
/// by `bind_mounts`. Returns Ok(()) on success; bind-mount must
/// be present at start time or the companion will 502. A hook error
/// aborts the install of this companion before any image work happens.
pub pre_start: Option<PreStartHook>,
/// Bind mounts as `(host path, container path)` pairs. Always
/// read-only — companions don't write to host paths.
pub bind_mounts: &'static [(&'static str, &'static str)],
/// `(host port, container port)` pairs, published as TCP, for
/// non-host-network companions.
pub ports: &'static [(u16, u16)],
/// Whether the companion must share the host network namespace. When
/// false the unit is attached to the network named "bridge" instead.
pub host_network: bool,
}
/// Signature of a [`CompanionSpec::pre_start`] hook: a plain function pointer
/// that returns a boxed future, so hooks can be stored in the `const` spec
/// tables and awaited at install time.
pub type PreStartHook = fn() -> futures_util::future::BoxFuture<'static, Result<()>>;
/// Look up the companion UIs that belong to the backend `package_id`.
///
/// Several package ids can share one companion (all bitcoin flavours map to
/// the same RPC viewer, for example). Apps without a companion UI yield an
/// empty slice.
pub fn companions_for(package_id: &str) -> &'static [CompanionSpec] {
    if matches!(package_id, "bitcoin" | "bitcoin-core" | "bitcoin-knots") {
        return BITCOIN_UI;
    }
    if package_id == "lnd" {
        return LND_UI;
    }
    if matches!(package_id, "electrumx" | "electrs" | "mempool-electrs") {
        return ELECTRS_UI;
    }
    if matches!(package_id, "fedimint" | "fedimintd") {
        return FEDIMINT_UI;
    }
    &[]
}
/// Companion for the bitcoin backends (`bitcoin`, `bitcoin-core`,
/// `bitcoin-knots`): a host-networked container with no published ports.
///
/// The pre-start hook renders the bitcoin-ui nginx config, and the single
/// bind mount places `/var/lib/archipelago/bitcoin-ui/nginx.conf` over the
/// container's default nginx site.
/// NOTE(review): confirm `RenderPaths::default()` writes to that host path.
const BITCOIN_UI: &[CompanionSpec] = &[CompanionSpec {
name: "archy-bitcoin-ui",
image_base: "bitcoin-ui",
build_dir_candidates: &[
"/opt/archipelago/docker/bitcoin-ui",
"/home/archipelago/archy/docker/bitcoin-ui",
"/home/archipelago/Projects/archy/docker/bitcoin-ui",
],
pre_start: Some(render_bitcoin_ui),
bind_mounts: &[(
"/var/lib/archipelago/bitcoin-ui/nginx.conf",
"/etc/nginx/conf.d/default.conf",
)],
ports: &[],
host_network: true,
}];
/// Companion for `lnd`: a host-networked container with no pre-start hook,
/// no bind mounts and no published ports.
const LND_UI: &[CompanionSpec] = &[CompanionSpec {
name: "archy-lnd-ui",
image_base: "lnd-ui",
build_dir_candidates: &[
"/opt/archipelago/docker/lnd-ui",
"/home/archipelago/archy/docker/lnd-ui",
"/home/archipelago/Projects/archy/docker/lnd-ui",
],
pre_start: None,
bind_mounts: &[],
// Host networking so the app's own nginx can proxy the archipelago backend
// same-origin (127.0.0.1:5678), exactly like fips-ui / electrs-ui. The
// previous bridge + 18083→80 mapping forced the browser to fetch the
// backend cross-origin from the app's port, which depended on the host
// nginx route + a CORS Origin/Host match and broke on http-only nodes
// (e.g. .116: blank fields, QR "failed to fetch"). The app's nginx now
// listens on 18083 directly (NOT 80 — that would collide with host nginx).
ports: &[],
host_network: true,
}];
/// Companion for the electrum indexers (`electrumx`, `electrs`,
/// `mempool-electrs`): a host-networked container with no pre-start hook,
/// no bind mounts and no published ports.
const ELECTRS_UI: &[CompanionSpec] = &[CompanionSpec {
name: "archy-electrs-ui",
image_base: "electrs-ui",
build_dir_candidates: &[
"/opt/archipelago/docker/electrs-ui",
"/home/archipelago/archy/docker/electrs-ui",
"/home/archipelago/Projects/archy/docker/electrs-ui",
],
pre_start: None,
bind_mounts: &[],
ports: &[],
host_network: true,
}];
/// Companion for `fedimint` / `fedimintd`: a host-networked container with
/// no pre-start hook, no bind mounts and no published ports.
const FEDIMINT_UI: &[CompanionSpec] = &[CompanionSpec {
name: "archy-fedimint-ui",
image_base: "fedimint-ui",
build_dir_candidates: &[
"/opt/archipelago/docker/fedimint-ui",
"/home/archipelago/archy/docker/fedimint-ui",
"/home/archipelago/Projects/archy/docker/fedimint-ui",
],
pre_start: None,
bind_mounts: &[],
ports: &[],
host_network: true,
}];
/// Pre-start hook for the bitcoin companion: renders its nginx config using
/// the default render paths and discards whatever `render` reports back.
/// Any failure is returned with a "render bitcoin-ui nginx.conf" context.
fn render_bitcoin_ui() -> futures_util::future::BoxFuture<'static, Result<()>> {
    Box::pin(async move {
        let paths = crate::container::bitcoin_ui::RenderPaths::default();
        let rendered = crate::container::bitcoin_ui::render(&paths).await;
        rendered.map(drop).context("render bitcoin-ui nginx.conf")
    })
}
/// Provision and start every companion for `package_id`. Each
/// companion is independent — a failure in one is logged but does
/// not abort the others.
pub async fn install_for(package_id: &str) -> Vec<(String, anyhow::Error)> {
let mut failures = Vec::new();
for spec in companions_for(package_id) {
if let Err(e) = install_one(spec).await {
warn!(companion = spec.name, error = %e, "companion install failed");
failures.push((spec.name.to_string(), e));
}
}
failures
}
/// Stop and remove every companion for `package_id`. Best effort:
/// errors are logged but do not abort the sequence.
pub async fn remove_for(package_id: &str) {
let dir = match quadlet::unit_dir().await {
Ok(d) => d,
Err(e) => {
warn!("companion remove: cannot resolve quadlet dir: {e:#}");
return;
}
};
for spec in companions_for(package_id) {
if let Err(e) = quadlet::disable_remove(spec.name, &dir).await {
warn!(companion = spec.name, error = %e, "companion remove failed");
}
}
}
/// Bring a single companion up, in this order: run its pre-start hook (if
/// any), make sure its image is available, write the quadlet unit, reload the
/// user systemd manager when the unit text changed, then enable and start the
/// service.
///
/// # Errors
/// Fails on the first step that fails; later steps are not attempted.
pub async fn install_one(spec: &CompanionSpec) -> Result<()> {
    if let Some(render_config) = spec.pre_start {
        let rendered = render_config().await;
        rendered.with_context(|| {
            format!(
                "pre-start hook failed for {} — companion will not start",
                spec.name
            )
        })?;
    }

    let image_ref = ensure_image_present(spec).await?;
    let unit = build_unit(spec, &image_ref);
    let unit_dir = quadlet::unit_dir().await?;

    // Only bother systemd with a reload when the unit file actually changed.
    if quadlet::write_if_changed(&unit, &unit_dir).await? {
        info!(companion = spec.name, "wrote quadlet unit");
        quadlet::daemon_reload_user().await?;
    }

    // Start is idempotent — if already running, systemctl returns 0.
    // NOTE(review): when the image was rebuilt under the same tag the unit
    // text is unchanged, so nothing here restarts an already-running service;
    // confirm whether the new image is expected to be picked up elsewhere.
    let service = unit.service_name();
    quadlet::enable_now(&service).await?;
    info!(companion = spec.name, "companion started");
    Ok(())
}
/// Resolve the image the companion's quadlet unit should reference, building
/// or pulling it when necessary.
///
/// Resolution order:
/// 1. For the first build-dir candidate that contains a `Dockerfile`:
///    - `localhost/<base>:local`, if present (manual override, never rebuilt);
///    - `localhost/<base>:latest`, if present and the build context has not
///      changed since it was built;
///    - otherwise a fresh `podman build` tagged `localhost/<base>:latest`.
/// 2. A pull of `<registry>/<base>:latest`, used when no Dockerfile exists or
///    the local build did not succeed for any reason (non-zero exit, timeout
///    or spawn failure).
/// 3. If the pull fails too, but an older auto-built `:latest` image is still
///    present (its rebuild was what failed), that stale image is reused so
///    the companion keeps running instead of failing outright.
///
/// Returns the image ref to put in the unit.
///
/// # Errors
/// Returns the pull error when neither a local build, a registry pull, nor a
/// previously built local image is available.
async fn ensure_image_present(spec: &CompanionSpec) -> Result<String> {
    let local_image = format!("localhost/{}:latest", spec.image_base);
    let local_image_compat = format!("localhost/{}:local", spec.image_base);
    let registry_image = format!("{}/{}:latest", COMPANION_REGISTRY, spec.image_base);

    // Set when an auto-built image exists but rebuilding it failed; used as a
    // last resort if the registry pull fails as well.
    let mut stale_local: Option<String> = None;

    // Prefer local build — companions can carry build-time customizations
    // (e.g. nginx.conf templates baked in). Search known candidates.
    for dir in spec.build_dir_candidates {
        let dockerfile = PathBuf::from(dir).join("Dockerfile");
        if !fs::try_exists(&dockerfile).await.unwrap_or(false) {
            continue;
        }

        // `:local` is a deliberate manual override — never auto-rebuild it.
        if image_exists(&local_image_compat).await {
            return Ok(local_image_compat);
        }

        // Reuse the auto-built `:latest` only when the build context has NOT
        // changed since it was built. Without this staleness check an
        // already-present image is reused forever, so edits to the baked-in
        // context (Dockerfile, nginx.conf, …) never reach the node.
        let had_local = image_exists(&local_image).await;
        if had_local {
            if !context_is_newer_than_image(dir, &local_image).await {
                return Ok(local_image);
            }
            info!(
                companion = spec.name,
                "build context changed since image built; rebuilding {dir}"
            );
        } else {
            info!(companion = spec.name, "building locally from {dir}");
        }

        let built = command_output_with_timeout(
            Command::new("podman").args(["build", "-t", &local_image, dir]),
            COMPANION_BUILD_TIMEOUT,
            "podman build companion image",
        )
        .await;
        match built {
            Ok(out) if out.status.success() => return Ok(local_image),
            Ok(out) => {
                warn!(
                    companion = spec.name,
                    "local build failed: {}",
                    String::from_utf8_lossy(&out.stderr).trim()
                );
            }
            // A timeout or spawn failure is just another failed build: log it
            // and try the registry instead of aborting the whole install.
            Err(e) => {
                warn!(companion = spec.name, "local build failed: {e:#}");
            }
        }

        // A failed `podman build -t` leaves the previous tag untouched, so the
        // old image is still usable if everything else fails.
        if had_local {
            stale_local = Some(local_image.clone());
        }
        // Fall through to registry pull rather than fail outright.
        break;
    }

    // Registry pull. Use insecure flag only for whitelisted hosts.
    let mut cmd = Command::new("podman");
    cmd.arg("pull");
    if image_uses_insecure_registry(&registry_image) {
        cmd.arg("--tls-verify=false");
    }
    cmd.arg(&registry_image);
    let pulled = command_output_with_timeout(
        &mut cmd,
        COMPANION_PULL_TIMEOUT,
        "podman pull companion image",
    )
    .await;
    let pull_error = match pulled {
        Ok(out) if out.status.success() => return Ok(registry_image),
        Ok(out) => anyhow::anyhow!(
            "no local Dockerfile and registry pull failed for {}: {}",
            spec.name,
            String::from_utf8_lossy(&out.stderr).trim()
        ),
        Err(e) => e,
    };

    match stale_local {
        Some(image) => {
            warn!(
                companion = spec.name,
                error = %pull_error,
                "rebuild and registry pull both failed; reusing existing local image"
            );
            Ok(image)
        }
        None => Err(pull_error),
    }
}
/// Report whether `image` is present in local podman storage.
///
/// Runs `podman image inspect <image>` and looks only at the exit status.
/// A spawn failure or a timeout is logged and treated as "not present".
///
/// The child's stdout/stderr are discarded: `status()` would otherwise
/// inherit the daemon's streams and dump the inspect JSON (or podman's
/// "no such image" error) into its output on every probe. `kill_on_drop`
/// makes sure a probe that hits the timeout does not leave a podman process
/// behind.
async fn image_exists(image: &str) -> bool {
    let mut cmd = Command::new("podman");
    cmd.args(["image", "inspect", image])
        .stdout(std::process::Stdio::null())
        .stderr(std::process::Stdio::null())
        .kill_on_drop(true);
    match tokio::time::timeout(COMPANION_IMAGE_CHECK_TIMEOUT, cmd.status()).await {
        Ok(Ok(status)) => status.success(),
        Ok(Err(err)) => {
            warn!(image = %image, error = %err, "companion image existence check failed");
            false
        }
        Err(_) => {
            warn!(image = %image, "companion image existence check timed out");
            false
        }
    }
}
/// Decide whether the cached `image` is stale relative to its build context.
///
/// True only when both timestamps are known and the newest file under `dir`
/// is strictly newer (whole seconds) than the image's creation time. If
/// either timestamp cannot be determined the answer is false — the cache is
/// reused — so an unreadable context or a failing inspect cannot trigger a
/// rebuild on every reconcile pass. The context is not walked at all when the
/// image timestamp is unavailable.
async fn context_is_newer_than_image(dir: &str, image: &str) -> bool {
    if let Some(built_at) = image_created_unix(image).await {
        let newest = newest_mtime_unix(PathBuf::from(dir)).await;
        return newest.map_or(false, |changed_at| changed_at > built_at);
    }
    false
}
/// Creation time of `image` in Unix seconds, read from
/// `podman image inspect --format {{.Created.Unix}}`.
///
/// Yields `None` when the command cannot be run or times out, exits
/// non-zero, or prints something that does not parse as an integer.
async fn image_created_unix(image: &str) -> Option<i64> {
    let mut inspect = Command::new("podman");
    inspect.args(["image", "inspect", "--format", "{{.Created.Unix}}", image]);
    let finished = command_output_with_timeout(
        &mut inspect,
        COMPANION_IMAGE_CHECK_TIMEOUT,
        "podman image created time",
    )
    .await;
    match finished {
        Ok(out) if out.status.success() => String::from_utf8_lossy(&out.stdout)
            .trim()
            .parse::<i64>()
            .ok(),
        _ => None,
    }
}
/// Async wrapper around [`newest_mtime_blocking`]: the recursive directory
/// walk is handed to a blocking thread so it does not stall the runtime.
/// Yields `None` if the walk finds no timestamp or the blocking task fails
/// to complete.
async fn newest_mtime_unix(dir: PathBuf) -> Option<i64> {
    let walk = tokio::task::spawn_blocking(move || newest_mtime_blocking(&dir));
    match walk.await {
        Ok(newest) => newest,
        Err(_) => None,
    }
}
/// Walk `dir` depth-first and return the newest modification time, in whole
/// Unix seconds, among the non-directory entries found.
///
/// Unreadable directories and entries whose metadata or mtime cannot be
/// obtained are skipped. Directory mtimes themselves are not considered.
/// Returns `None` when nothing with a usable timestamp was found (including
/// when `dir` does not exist or is empty).
fn newest_mtime_blocking(dir: &std::path::Path) -> Option<i64> {
    let mut latest: Option<i64> = None;
    let mut pending = vec![dir.to_path_buf()];

    while let Some(current) = pending.pop() {
        if let Ok(listing) = std::fs::read_dir(&current) {
            for item in listing.filter_map(Result::ok) {
                let info = match item.metadata() {
                    Ok(info) => info,
                    Err(_) => continue,
                };
                if info.is_dir() {
                    pending.push(item.path());
                    continue;
                }
                let stamp = info
                    .modified()
                    .ok()
                    .and_then(|at| at.duration_since(std::time::UNIX_EPOCH).ok())
                    .map(|since_epoch| since_epoch.as_secs() as i64);
                // `None` orders below every `Some`, so this keeps the running
                // maximum and ignores entries without a timestamp.
                latest = latest.max(stamp);
            }
        }
    }

    latest
}
/// Run `cmd` to completion and capture its output, giving up after
/// `timeout`.
///
/// The command is marked kill-on-drop first, so abandoning it on timeout
/// also terminates the child. `description` is only used in error messages.
///
/// # Errors
/// Returns "<description> timed out after Ns" when the deadline passes, or
/// "spawn <description>" when the process could not be run.
async fn command_output_with_timeout(
    cmd: &mut Command,
    timeout: Duration,
    description: &str,
) -> Result<std::process::Output> {
    cmd.kill_on_drop(true);
    let waited = tokio::time::timeout(timeout, cmd.output()).await;
    let ran = waited
        .with_context(|| format!("{description} timed out after {}s", timeout.as_secs()))?;
    ran.with_context(|| format!("spawn {description}"))
}
/// Translate a companion spec plus a resolved image ref into the quadlet
/// unit that should exist on disk.
///
/// Every companion gets the same hardening profile: 128 MB memory limit, all
/// capabilities dropped except the handful nginx needs, and read-only bind
/// mounts. Only name, image, network mode, mounts and ports vary per spec.
fn build_unit(spec: &CompanionSpec, image: &str) -> QuadletUnit {
    let network = match spec.host_network {
        true => NetworkMode::Host,
        false => NetworkMode::Bridge("bridge".into()),
    };

    QuadletUnit {
        name: spec.name.into(),
        description: format!("Archipelago companion UI: {}", spec.name),
        image: image.into(),
        network,
        // Run as root inside the container so nginx can chown its
        // worker dirs. Rootless podman maps this to a high host UID,
        // so it is unprivileged on the host.
        user: Some("0:0".into()),
        memory_mb: Some(128),
        cap_drop_all: true,
        cap_add: vec![
            "CHOWN".into(),
            "DAC_OVERRIDE".into(),
            "NET_BIND_SERVICE".into(),
            "SETUID".into(),
            "SETGID".into(),
        ],
        bind_mounts: spec
            .bind_mounts
            .iter()
            .map(|&(host_path, container_path)| BindMount {
                host: PathBuf::from(host_path),
                container: PathBuf::from(container_path),
                read_only: true,
            })
            .collect(),
        ports: spec
            .ports
            .iter()
            .map(|&(host_port, container_port)| (host_port, container_port, "tcp".into()))
            .collect(),
        extra_podman_args: vec![],
        depends_on: vec![],
        // Companions don't use the backend-manifest extension fields;
        // the renderer skips empty/false directives so the rendered
        // bytes are unchanged from before quadlet.rs grew the new fields.
        ..QuadletUnit::default()
    }
}
/// Heuristic for "can we talk to a user systemd manager?": true when
/// `XDG_RUNTIME_DIR` is set to a non-empty value.
///
/// In production archipelago.service inherits the variable from systemd; in
/// unit tests / CI sandboxes it is unset, where `systemctl --user` would fail
/// and writing units under HOME would be an unwanted side effect. The
/// reconciler skips its companion stage when this returns false.
fn user_systemd_available() -> bool {
    match std::env::var_os("XDG_RUNTIME_DIR") {
        Some(runtime_dir) => !runtime_dir.is_empty(),
        None => false,
    }
}
/// Reconcile companion presence: every expected companion for the
/// given installed apps must have its quadlet unit on disk and its
/// service active. Returns a list of (companion, error) for anything
/// that needed correction and failed.
///
/// Called from `boot_reconciler` so a deleted unit file or a stopped
/// service is repaired within one tick. No-ops if the user systemd
/// manager is not reachable (CI / test environments).
pub async fn reconcile(installed_apps: &[String]) -> Vec<(String, anyhow::Error)> {
if !user_systemd_available() {
return Vec::new();
}
let mut failures = Vec::new();
for app_id in installed_apps {
for spec in companions_for(app_id) {
match needs_repair(spec).await {
Ok(false) => {}
Ok(true) => {
info!(
companion = spec.name,
"reconcile: companion not active, repairing"
);
if let Err(e) = install_one(spec).await {
failures.push((spec.name.to_string(), e));
}
}
Err(e) => {
warn!(companion = spec.name, error = %e, "reconcile probe failed");
failures.push((spec.name.to_string(), e));
}
}
}
}
failures
}
/// Probe whether `install_one` has to be re-run for this companion.
///
/// Answers true when the unit file is missing, when its contents differ from
/// what would be rendered now (an unreadable file counts as different), or
/// when the service is not active. Note that working out the expected unit
/// goes through `ensure_image_present`, so this probe can itself build or
/// pull an image.
///
/// # Errors
/// Fails if the quadlet directory or the expected image cannot be resolved.
async fn needs_repair(spec: &CompanionSpec) -> Result<bool> {
    let unit_path = quadlet::unit_dir()
        .await?
        .join(format!("{}.container", spec.name));

    let unit_on_disk = fs::try_exists(&unit_path).await.unwrap_or(false);
    if !unit_on_disk {
        return Ok(true);
    }

    let image_ref = ensure_image_present(spec).await?;
    let wanted = build_unit(spec, &image_ref);
    if wanted.render() != fs::read_to_string(&unit_path).await.unwrap_or_default() {
        return Ok(true);
    }

    let service = format!("{}.service", spec.name);
    let active = quadlet::is_active(&service).await;
    Ok(!active)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Every known backend alias maps to exactly one companion; anything
    /// else maps to none.
    #[test]
    fn companions_for_known_apps_returns_expected_set() {
        let count = |app_id: &str| companions_for(app_id).len();

        assert_eq!(count("bitcoin-knots"), 1);
        assert_eq!(count("bitcoin-core"), 1);
        assert_eq!(count("bitcoin"), 1);
        assert_eq!(count("lnd"), 1);
        assert_eq!(count("electrumx"), 1);
        assert_eq!(count("electrs"), 1);
        assert_eq!(count("mempool-electrs"), 1);
        assert_eq!(count("fedimint"), 1);
        assert_eq!(count("fedimintd"), 1);
        assert_eq!(count("nextcloud"), 0);
        assert_eq!(count("not-a-real-app"), 0);
    }

    /// The bitcoin companion carries the shared hardening profile plus its
    /// single read-only nginx config mount.
    #[test]
    fn build_unit_uses_host_network_and_drops_caps() {
        let unit = build_unit(&BITCOIN_UI[0], "localhost/bitcoin-ui:latest");

        assert_eq!(unit.name, "archy-bitcoin-ui");
        assert!(matches!(unit.network, NetworkMode::Host));
        assert!(unit.cap_drop_all);
        assert!(unit.cap_add.iter().any(|cap| cap == "NET_BIND_SERVICE"));
        assert_eq!(unit.user.as_deref(), Some("0:0"));
        assert_eq!(unit.memory_mb, Some(128));
        assert_eq!(unit.bind_mounts.len(), 1);

        let mount = &unit.bind_mounts[0];
        assert_eq!(
            mount.container,
            PathBuf::from("/etc/nginx/conf.d/default.conf")
        );
        assert!(mount.read_only);
    }

    #[test]
    fn lnd_ui_uses_host_network_for_same_origin_backend_proxy() {
        // lnd-ui is host-networked (its nginx listens on 18083 directly) so the
        // app can proxy the archipelago backend same-origin instead of fetching
        // it cross-origin from its app port — see the spec comment for why.
        let unit = build_unit(&LND_UI[0], "localhost/lnd-ui:latest");

        assert_eq!(unit.name, "archy-lnd-ui");
        assert!(matches!(unit.network, NetworkMode::Host));
        assert!(unit.ports.is_empty());
    }

    #[test]
    fn fedimint_ui_uses_host_network_for_public_guardian_port() {
        let unit = build_unit(&FEDIMINT_UI[0], "localhost/fedimint-ui:latest");

        assert_eq!(unit.name, "archy-fedimint-ui");
        assert!(matches!(unit.network, NetworkMode::Host));
        assert!(unit.ports.is_empty());
    }
}