From 2ff47f88a79ffeb4fd3b6bc435298990c7e257af Mon Sep 17 00:00:00 2001
From: Dorian
Date: Wed, 13 May 2026 22:59:55 -0400
Subject: [PATCH] fix: harden container reconcile and launch behavior

---
 .dockerignore                                 |   8 +
 apps/bitcoin-knots/manifest.yml               |   2 +-
 .../src/container/prod_orchestrator.rs        |  41 ++++
 core/archipelago/src/health_monitor.rs        | 232 +++++++++++++++++-
 .../_archived/build-auto-installer-iso.sh     |   7 +-
 neode-ui/src/views/AppDetails.vue             |   5 +-
 neode-ui/src/views/apps/appsConfig.ts         |   5 +-
 scripts/container-specs.sh                    |   4 +-
 scripts/first-boot-containers.sh              |   6 +-
 9 files changed, 299 insertions(+), 11 deletions(-)

diff --git a/.dockerignore b/.dockerignore
index 7c300b37..fa217cb7 100644
--- a/.dockerignore
+++ b/.dockerignore
@@ -7,6 +7,14 @@
 # Allow demo assets (AIUI pre-built dist)
 !demo/
 
+# Allow backend source for ISO source builds
+!core/
+!scripts/
+!image-recipe/
+image-recipe/build/
+image-recipe/results/
+image-recipe/output/
+
 # Exclude nested node_modules (will npm install in container)
 neode-ui/node_modules
 neode-ui/dist
diff --git a/apps/bitcoin-knots/manifest.yml b/apps/bitcoin-knots/manifest.yml
index 175aded4..0e390f95 100644
--- a/apps/bitcoin-knots/manifest.yml
+++ b/apps/bitcoin-knots/manifest.yml
@@ -30,7 +30,7 @@ app:
         RPC_PASS="$(printenv BITCOIN_RPC_PASS)";
         DISK_GB_VALUE="$(printenv DISK_GB || true)";
         if [ "${DISK_GB_VALUE:-0}" -lt 1000 ]; then
-          exec "$BITCOIND" -datadir=/home/bitcoin/.bitcoin -noconf -server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=1024 -par=0 -maxconnections=125 -rpcuser="$RPC_USER" -rpcpassword="$RPC_PASS";
+          exec "$BITCOIND" -datadir=/home/bitcoin/.bitcoin -noconf -server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=2048 -par=0 -maxconnections=125 -rpcuser="$RPC_USER" -rpcpassword="$RPC_PASS";
         else
           exec "$BITCOIND" -datadir=/home/bitcoin/.bitcoin -noconf -server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096 -par=0 -maxconnections=125 -rpcuser="$RPC_USER" -rpcpassword="$RPC_PASS";
         fi
diff --git a/core/archipelago/src/container/prod_orchestrator.rs b/core/archipelago/src/container/prod_orchestrator.rs
index f1c59a1a..3f109333 100644
--- a/core/archipelago/src/container/prod_orchestrator.rs
+++ b/core/archipelago/src/container/prod_orchestrator.rs
@@ -62,6 +62,21 @@ fn is_required_baseline_app(app_id: &str) -> bool {
     )
 }
 
+/// Apps whose running container is left untouched when env drift is detected
+/// during an `ExistingOnly` (boot) reconcile.
+fn is_restart_sensitive_app(app_id: &str) -> bool {
+    matches!(
+        app_id,
+        "bitcoin-knots"
+            | "bitcoin-core"
+            | "bitcoin"
+            | "lnd"
+            | "btcpay-server"
+            | "fedimint"
+            | "fedimint-gateway"
+    )
+}
+
 fn requires_archival_bitcoin(app_id: &str) -> bool {
     matches!(
         app_id,
@@ -713,6 +728,17 @@ impl ProdContainerOrchestrator {
                         return Ok(ReconcileAction::Started);
                     }
                     if self.container_env_drifted(&name, &resolved_manifest).await {
+                        if mode == ReconcileMode::ExistingOnly
+                            && is_restart_sensitive_app(&app_id)
+                        {
+                            tracing::info!(
+                                app_id = %app_id,
+                                container = %name,
+                                "container drift detected during boot reconcile; leaving running restart-sensitive app untouched"
+                            );
+                            self.run_post_start_hooks(&app_id).await?;
+                            return Ok(ReconcileAction::NoOp);
+                        }
                         tracing::info!(app_id = %app_id, container = %name, "container env drift detected — recreating");
                         let _ = self.runtime.stop_container(&name).await;
                         let _ = self.runtime.remove_container(&name).await;
@@ -2252,6 +2278,7 @@ mod tests {
             runtime,
             PathBuf::from("/nonexistent-for-tests"),
         );
+        orch.set_data_dir(PathBuf::from("/nonexistent-for-tests"));
         // Redirect the bitcoin-ui pre-start hook to a test-scoped
         // tmpdir, seeded with a fake password file. Shared across
         // every test in this module (OnceLock), so the hook can run
@@ -2259,6 +2286,7 @@ mod tests {
         // this redirection, any test that installs the bitcoin-ui
         // fixture would try to write under /var/lib/archipelago.
         orch.set_bitcoin_ui_paths(test_bitcoin_ui_paths());
+        orch.set_filebrowser_paths(test_filebrowser_paths());
         orch
     }
 
@@ -2339,6 +2367,19 @@ app:
         }
     }
 
+    // Points the filebrowser paths at a tmpdir created once (OnceLock) and
+    // reused by every call in this module.
+    fn test_filebrowser_paths() -> filebrowser::EnsurePaths {
+        use std::sync::OnceLock;
+        static DIR: OnceLock<tempfile::TempDir> = OnceLock::new();
+        let dir = DIR.get_or_init(|| tempfile::TempDir::new().expect("test tmpdir"));
+        filebrowser::EnsurePaths {
+            srv_root: dir.path().join("filebrowser"),
+            data_dir: dir.path().join("filebrowser-data"),
+            config_path: dir.path().join("filebrowser-data/.filebrowser.json"),
+        }
+    }
+
     #[tokio::test]
     async fn install_fresh_pull() {
         let rt = Arc::new(MockRuntime::default());
diff --git a/core/archipelago/src/health_monitor.rs b/core/archipelago/src/health_monitor.rs
index efd1567b..53d3537f 100644
--- a/core/archipelago/src/health_monitor.rs
+++ b/core/archipelago/src/health_monitor.rs
@@ -8,7 +8,7 @@ use crate::data_model::{Notification, NotificationLevel, PackageState};
 use crate::state::StateManager;
 use crate::webhooks::{self, WebhookEvent};
 use serde::{Deserialize, Serialize};
-use std::collections::HashMap;
+use std::collections::{HashMap, HashSet};
 use std::path::{Path, PathBuf};
 use std::sync::Arc;
 use std::time::Instant;
@@ -420,6 +420,9 @@ async fn check_containers() -> Vec {
     let stdout = String::from_utf8_lossy(&output.stdout);
     let containers: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap_or_default();
 
+    let live_container_ids = live_container_ids(&containers);
+    cleanup_stale_podman_healthcheck_units(&live_container_ids).await;
+
     // Monitor ALL long-running containers for health — backend services (databases,
     // nbxplorer, mempool-api) and UI containers need auto-restart too.
     // Only skip ephemeral containers (build infrastructure, init one-shots).
@@ -462,6 +465,184 @@ async fn check_containers() -> Vec {
         .collect()
 }
 
+/// Collects the `Id` (falling back to `ID`) string of every container entry.
+fn live_container_ids(containers: &[serde_json::Value]) -> HashSet<String> {
+    containers
+        .iter()
+        .filter_map(|c| {
+            c.get("Id")
+                .or_else(|| c.get("ID"))
+                .and_then(|v| v.as_str())
+                .map(|s| s.to_string())
+        })
+        .collect()
+}
+
+/// Stops systemd user units left behind by Podman healthchecks whose container
+/// ID is not in `live_container_ids`. Does nothing when the set is empty, since
+/// with no live IDs every healthcheck unit would be classified as stale.
+async fn cleanup_stale_podman_healthcheck_units(live_container_ids: &HashSet<String>) {
+    if live_container_ids.is_empty() {
+        return;
+    }
+
+    let mut units = stale_healthcheck_units_from_systemd(live_container_ids).await;
+    if units.is_empty() {
+        return;
+    }
+    units.sort();
+    units.dedup();
+
+    let mut cleaned = 0;
+    for unit in units {
+        let Some(container_id) = parse_podman_healthcheck_unit(&unit) else {
+            continue;
+        };
+        // `units` mixes `.timer` and `.service` names; derive the companion
+        // service from the shared stem so a `.service` entry does not become
+        // `<id>-<hash>.service.service`.
+        let service = healthcheck_service_unit(&unit);
+        if stop_user_unit(&unit).await {
+            cleaned += 1;
+        }
+        if service != unit {
+            let _ = stop_user_unit(&service).await;
+        }
+        let _ = reset_failed_user_unit(&service).await;
+        debug!(
+            "Stopped stale Podman healthcheck unit {} for removed container {}",
+            unit, container_id
+        );
+    }
+
+    if cleaned > 0 {
+        info!("Cleaned {} stale Podman healthcheck timer(s)", cleaned);
+    }
+}
+
+/// Lists user timers and units via `systemctl --user` and returns the Podman
+/// healthcheck unit names whose container ID is not in `live_container_ids`.
+/// A listing that fails or exceeds 20 seconds is skipped.
+async fn stale_healthcheck_units_from_systemd(live_container_ids: &HashSet<String>) -> Vec<String> {
+    let mut units = Vec::new();
+    for args in [
+        ["--user", "list-timers", "--all", "--no-legend", "--no-pager"].as_slice(),
+        ["--user", "list-units", "--all", "--no-legend", "--no-pager"].as_slice(),
+    ] {
+        let output = match tokio::time::timeout(
+            std::time::Duration::from_secs(20),
+            tokio::process::Command::new("systemctl")
+                .args(args.iter().copied())
+                .output(),
+        )
+        .await
+        {
+            Ok(Ok(output)) if output.status.success() => output,
+            Ok(Ok(output)) => {
+                let stderr = String::from_utf8_lossy(&output.stderr);
+                debug!("systemctl {} failed: {}", args.join(" "), stderr.trim());
+                continue;
+            }
+            Ok(Err(e)) => {
+                debug!("Failed to run systemctl {}: {}", args.join(" "), e);
+                continue;
+            }
+            Err(_) => {
+                debug!("systemctl {} timed out", args.join(" "));
+                continue;
+            }
+        };
+
+        let stdout = String::from_utf8_lossy(&output.stdout);
+        units.extend(stale_healthcheck_units(&stdout, live_container_ids));
+    }
+    units
+}
+
+/// Scans the whitespace-separated tokens of `output` for Podman healthcheck unit
+/// names and returns those whose container ID is not in `live_container_ids`.
+fn stale_healthcheck_units(output: &str, live_container_ids: &HashSet<String>) -> Vec<String> {
+    output
+        .lines()
+        .flat_map(|line| line.split_whitespace())
+        .filter_map(|token| {
+            let unit = token.trim_start_matches('●');
+            let id = parse_podman_healthcheck_unit(unit)?;
+            (!live_container_ids.contains(id)).then(|| unit.to_string())
+        })
+        .collect()
+}
+
+/// Returns the container ID embedded in a `<64-hex-id>-<suffix>.timer` or
+/// `.service` unit name, or `None` when `unit` does not have that shape.
+fn parse_podman_healthcheck_unit(unit: &str) -> Option<&str> {
+    let unit = unit
+        .strip_suffix(".timer")
+        .or_else(|| unit.strip_suffix(".service"))?;
+    let (container_id, _suffix) = unit.split_once('-')?;
+    if container_id.len() == 64 && container_id.bytes().all(|b| b.is_ascii_hexdigit()) {
+        Some(container_id)
+    } else {
+        None
+    }
+}
+
+/// Maps a healthcheck `.timer` or `.service` unit name to its `.service` name.
+fn healthcheck_service_unit(unit: &str) -> String {
+    let stem = unit
+        .strip_suffix(".timer")
+        .or_else(|| unit.strip_suffix(".service"))
+        .unwrap_or(unit);
+    format!("{stem}.service")
+}
+
+/// Runs `systemctl --user stop <unit>`; returns whether it succeeded.
+async fn stop_user_unit(unit: &str) -> bool {
+    run_systemctl_user(["stop", unit]).await
+}
+
+/// Runs `systemctl --user reset-failed <unit>`; returns whether it succeeded.
+async fn reset_failed_user_unit(unit: &str) -> bool {
+    run_systemctl_user(["reset-failed", unit]).await
+}
+
+/// Runs `systemctl --user <args>` with a 10-second timeout. Returns `true` only
+/// on a successful exit status; spawn errors, timeouts and failures are logged
+/// at debug level and reported as `false`.
+async fn run_systemctl_user<const N: usize>(args: [&str; N]) -> bool {
+    let output = match tokio::time::timeout(
+        std::time::Duration::from_secs(10),
+        tokio::process::Command::new("systemctl")
+            .arg("--user")
+            .args(args.iter().copied())
+            .output(),
+    )
+    .await
+    {
+        Ok(Ok(output)) => output,
+        Ok(Err(e)) => {
+            debug!("Failed to run systemctl --user {}: {}", args.join(" "), e);
+            return false;
+        }
+        Err(_) => {
+            debug!("systemctl --user {} timed out", args.join(" "));
+            return false;
+        }
+    };
+
+    if output.status.success() {
+        true
+    } else {
+        let stderr = String::from_utf8_lossy(&output.stderr);
+        debug!(
+            "systemctl --user {} failed: {}",
+            args.join(" "),
+            stderr.trim()
+        );
+        false
+    }
+}
+
 fn parse_podman_health(c: &serde_json::Value, state: &str) -> Option<String> {
     c.get("Status")
         .and_then(|v| v.as_str())
@@ -1173,4 +1354,53 @@ mod tests {
             Some("unhealthy")
         );
     }
+
+    #[test]
+    fn parses_podman_healthcheck_systemd_units() {
+        let id = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c";
+        assert_eq!(
+            parse_podman_healthcheck_unit(&format!("{}-15c66ddfefa8a763.timer", id)),
+            Some(id)
+        );
+        assert_eq!(
+            parse_podman_healthcheck_unit(&format!("{}-15c66ddfefa8a763.service", id)),
+            Some(id)
+        );
+        assert_eq!(parse_podman_healthcheck_unit("grafana.service"), None);
+        assert_eq!(
+            parse_podman_healthcheck_unit("nothexzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz-x.timer"),
+            None
+        );
+    }
+
+    #[test]
+    fn stale_healthcheck_units_filters_only_removed_container_ids() {
+        let live = "6467e25fd87d791a63fe9dbf6e2fabc7bf26533aa2c402b1089effeacf7ebbba";
+        let stale = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c";
+        let mut live_ids = HashSet::new();
+        live_ids.insert(live.to_string());
+
+        let output = format!(
+            " {live}-6fdc497fd3ba3b62.timer loaded active waiting\n\
+             ● {stale}-15c66ddfefa8a763.service loaded failed failed\n\
+             grafana.service loaded active running\n\
+             {stale}-1898d85de0bb707f.timer loaded active waiting\n"
+        );
+
+        let mut units = stale_healthcheck_units(&output, &live_ids);
+        units.sort();
+        assert_eq!(
+            units,
+            vec![
+                format!("{stale}-15c66ddfefa8a763.service"),
+                format!("{stale}-1898d85de0bb707f.timer"),
+            ]
+        );
+    }
+
+    #[test]
+    fn healthcheck_service_unit_handles_timer_and_service_names() {
+        assert_eq!(healthcheck_service_unit("abc-123.timer"), "abc-123.service");
+        assert_eq!(healthcheck_service_unit("abc-123.service"), "abc-123.service");
+    }
 }
diff --git a/image-recipe/_archived/build-auto-installer-iso.sh b/image-recipe/_archived/build-auto-installer-iso.sh
index a4f647cb..f386063f 100755
--- a/image-recipe/_archived/build-auto-installer-iso.sh
+++ b/image-recipe/_archived/build-auto-installer-iso.sh
@@ -1100,12 +1100,15 @@ if [ "$BACKEND_CAPTURED" = "0" ]; then
 FROM rust:1.93-trixie as builder
 WORKDIR /build
 COPY core ./core
+COPY scripts ./scripts
+COPY image-recipe/configs ./image-recipe/configs
 RUN cd core && cargo build --release --bin archipelago
 BACKENDFILE
 
-    if $CONTAINER_CMD build --platform $CONTAINER_PLATFORM -t archipelago-backend -f "$BACKEND_DOCKERFILE" "$SCRIPT_DIR/.." 2>&1 | tail -20; then
+    BACKEND_IMAGE="localhost/archipelago-backend:iso"
+    if $CONTAINER_CMD build --platform $CONTAINER_PLATFORM -t "$BACKEND_IMAGE" -f "$BACKEND_DOCKERFILE" "$SCRIPT_DIR/.."; then
         echo " Extracting backend binary..."
-        BACKEND_CONTAINER=$($CONTAINER_CMD create --platform $CONTAINER_PLATFORM archipelago-backend)
+        BACKEND_CONTAINER=$($CONTAINER_CMD create --platform $CONTAINER_PLATFORM "$BACKEND_IMAGE")
         $CONTAINER_CMD cp "$BACKEND_CONTAINER:/build/core/target/release/archipelago" "$ARCH_DIR/bin/" && \
         echo " ✅ Backend binary built ($(du -h "$ARCH_DIR/bin/archipelago" | cut -f1))"
         $CONTAINER_CMD rm "$BACKEND_CONTAINER"
diff --git a/neode-ui/src/views/AppDetails.vue b/neode-ui/src/views/AppDetails.vue
index e8d65a52..4c0192cb 100644
--- a/neode-ui/src/views/AppDetails.vue
+++ b/neode-ui/src/views/AppDetails.vue
@@ -140,6 +140,7 @@ import { rpcClient } from '@/api/rpc-client'
 import AppHeroSection from './appDetails/AppHeroSection.vue'
 import AppContentSection from './appDetails/AppContentSection.vue'
 import AppSidebar from './appDetails/AppSidebar.vue'
+import { resolveAppUrl } from './appSession/appSessionConfig'
 import {
   WEB_ONLY_APP_URLS,
   PACKAGE_ALIASES,
@@ -266,7 +267,9 @@ const backButtonText = computed(() => {
 const canLaunch = computed(() => {
   if (!pkg.value) return false
   if (isWebOnly.value) return true
-  const hasUI = !!(pkg.value.manifest.interfaces?.main?.ui || pkg.value.installed?.['interface-addresses']?.main)
+  const hasRuntimeAddress = !!pkg.value.installed?.['interface-addresses']?.main?.['lan-address']
+  const hasKnownLaunchUrl = typeof window !== 'undefined' && !!resolveAppUrl(pkg.value.manifest.id)
+  const hasUI = !!(pkg.value.manifest.interfaces?.main?.ui || hasRuntimeAddress || hasKnownLaunchUrl)
   return hasUI && pkg.value.state === 'running' && pkg.value.health !== 'starting' && pkg.value.health !== 'unhealthy'
 })
 
diff --git a/neode-ui/src/views/apps/appsConfig.ts b/neode-ui/src/views/apps/appsConfig.ts
index ea138b4b..9ab1d91e 100644
--- a/neode-ui/src/views/apps/appsConfig.ts
+++ b/neode-ui/src/views/apps/appsConfig.ts
@@ -3,6 +3,7 @@
 import type { Ref } from 'vue'
 import { computed } from 'vue'
 import { PackageState, type PackageDataEntry } from '@/types/api'
+import { resolveAppUrl } from '../appSession/appSessionConfig'
 
 // Service container name patterns (backend/infra, not user-facing)
 export const SERVICE_NAMES = new Set([
@@ -144,7 +145,9 @@ export function resolveAppIcon(id: string, pkg: PackageDataEntry, curatedIcon?:
 
 export function canLaunch(pkg: PackageDataEntry): boolean {
   if (isWebOnlyApp(pkg.manifest.id)) return true
-  const hasUI = pkg.manifest.interfaces?.main?.ui || pkg.installed?.['interface-addresses']?.main
+  const hasRuntimeAddress = !!pkg.installed?.['interface-addresses']?.main?.['lan-address']
+  const hasKnownLaunchUrl = typeof window !== 'undefined' && !!resolveAppUrl(pkg.manifest.id)
+  const hasUI = pkg.manifest.interfaces?.main?.ui || hasRuntimeAddress || hasKnownLaunchUrl
   return !!hasUI && pkg.state === 'running' && pkg.health !== 'starting' && pkg.health !== 'unhealthy'
 }
 
diff --git a/scripts/container-specs.sh b/scripts/container-specs.sh
index b13e60a1..95537f21 100755
--- a/scripts/container-specs.sh
+++ b/scripts/container-specs.sh
@@ -173,9 +173,9 @@ load_spec_bitcoin-knots() {
     SPEC_DATA_UID="100101:100101"
     # Dynamic: prune on small disk
     if [ "${DISK_GB:-0}" -lt 1000 ]; then
-        SPEC_CUSTOM_ARGS="-server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=512"
+        SPEC_CUSTOM_ARGS="-server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=2048 -par=0 -maxconnections=125"
     else
-        SPEC_CUSTOM_ARGS="-server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096"
+        SPEC_CUSTOM_ARGS="-server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096 -par=0 -maxconnections=125"
     fi
 }
 
diff --git a/scripts/first-boot-containers.sh b/scripts/first-boot-containers.sh
index 2e063001..3a6ddb80 100755
--- a/scripts/first-boot-containers.sh
+++ b/scripts/first-boot-containers.sh
@@ -557,11 +557,11 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -qE 'bitcoin-knots|arch
     [ -z "$DISK_GB" ] && DISK_GB=$(df --output=size -BG / 2>/dev/null | tail -1 | tr -dc '0-9')
     if [ "${DISK_GB:-0}" -lt 1000 ]; then
         BTC_EXTRA_ARGS="-prune=550"
-        BTC_DBCACHE=512
+        BTC_DBCACHE=2048
         log " Small disk (${DISK_GB}GB) — enabling pruning"
     else
         BTC_EXTRA_ARGS="-txindex=1"
-        BTC_DBCACHE=2048
+        BTC_DBCACHE=4096
         log " Large disk (${DISK_GB}GB) — enabling txindex"
     fi
     if $DOCKER run -d --name bitcoin-knots --restart unless-stopped \
@@ -574,7 +574,7 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -qE 'bitcoin-knots|arch
         -v /var/lib/archipelago/bitcoin:/home/bitcoin/.bitcoin \
         "${BITCOIN_KNOTS_IMAGE}" \
         $BTC_EXTRA_ARGS \
-        -printtoconsole=1 -dbcache=$BTC_DBCACHE 2>>"$LOG"; then
+        -printtoconsole=1 -dbcache=$BTC_DBCACHE -par=0 -maxconnections=125 2>>"$LOG"; then
         log "Bitcoin Knots started"
     else
         log "Bitcoin Knots failed (may already exist)"