fix: harden container reconcile and launch behavior

This commit is contained in:
Dorian 2026-05-13 22:59:55 -04:00
parent 835c525218
commit 2ff47f88a7
9 changed files with 259 additions and 11 deletions

View File

@ -7,6 +7,14 @@
# Allow demo assets (AIUI pre-built dist)
!demo/
# Allow backend source for ISO source builds
!core/
!scripts/
!image-recipe/
image-recipe/build/
image-recipe/results/
image-recipe/output/
# Exclude nested node_modules (will npm install in container)
neode-ui/node_modules
neode-ui/dist

View File

@ -30,7 +30,7 @@ app:
RPC_PASS="$(printenv BITCOIN_RPC_PASS)";
DISK_GB_VALUE="$(printenv DISK_GB || true)";
if [ "${DISK_GB_VALUE:-0}" -lt 1000 ]; then
exec "$BITCOIND" -datadir=/home/bitcoin/.bitcoin -noconf -server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=1024 -par=0 -maxconnections=125 -rpcuser="$RPC_USER" -rpcpassword="$RPC_PASS";
exec "$BITCOIND" -datadir=/home/bitcoin/.bitcoin -noconf -server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=2048 -par=0 -maxconnections=125 -rpcuser="$RPC_USER" -rpcpassword="$RPC_PASS";
else
exec "$BITCOIND" -datadir=/home/bitcoin/.bitcoin -noconf -server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096 -par=0 -maxconnections=125 -rpcuser="$RPC_USER" -rpcpassword="$RPC_PASS";
fi

View File

@ -62,6 +62,19 @@ fn is_required_baseline_app(app_id: &str) -> bool {
)
}
/// Reports whether `app_id` belongs to the fixed set of apps that are treated
/// as restart-sensitive (the Bitcoin node variants, LND, BTCPay Server and the
/// Fedimint services).
///
/// The comparison is an exact, case-sensitive match on the whole id.
fn is_restart_sensitive_app(app_id: &str) -> bool {
    const RESTART_SENSITIVE_APPS: [&str; 7] = [
        "bitcoin-knots",
        "bitcoin-core",
        "bitcoin",
        "lnd",
        "btcpay-server",
        "fedimint",
        "fedimint-gateway",
    ];
    RESTART_SENSITIVE_APPS.contains(&app_id)
}
fn requires_archival_bitcoin(app_id: &str) -> bool {
matches!(
app_id,
@ -713,6 +726,17 @@ impl ProdContainerOrchestrator {
return Ok(ReconcileAction::Started);
}
if self.container_env_drifted(&name, &resolved_manifest).await {
if mode == ReconcileMode::ExistingOnly
&& is_restart_sensitive_app(&app_id)
{
tracing::info!(
app_id = %app_id,
container = %name,
"container drift detected during boot reconcile; leaving running restart-sensitive app untouched"
);
self.run_post_start_hooks(&app_id).await?;
return Ok(ReconcileAction::NoOp);
}
tracing::info!(app_id = %app_id, container = %name, "container env drift detected — recreating");
let _ = self.runtime.stop_container(&name).await;
let _ = self.runtime.remove_container(&name).await;
@ -2252,6 +2276,7 @@ mod tests {
runtime,
PathBuf::from("/nonexistent-for-tests"),
);
orch.set_data_dir(PathBuf::from("/nonexistent-for-tests"));
// Redirect the bitcoin-ui pre-start hook to a test-scoped
// tmpdir, seeded with a fake password file. Shared across
// every test in this module (OnceLock), so the hook can run
@ -2259,6 +2284,7 @@ mod tests {
// this redirection, any test that installs the bitcoin-ui
// fixture would try to write under /var/lib/archipelago.
orch.set_bitcoin_ui_paths(test_bitcoin_ui_paths());
orch.set_filebrowser_paths(test_filebrowser_paths());
orch
}
@ -2339,6 +2365,17 @@ app:
}
}
/// Returns filebrowser paths located under a temporary directory.
///
/// The directory is created on first use and kept in a function-local
/// `OnceLock`, so every call in the same test process gets the same paths.
fn test_filebrowser_paths() -> filebrowser::EnsurePaths {
    use std::sync::OnceLock;

    static SHARED_TMP: OnceLock<tempfile::TempDir> = OnceLock::new();

    let root = SHARED_TMP
        .get_or_init(|| tempfile::TempDir::new().expect("test tmpdir"))
        .path();

    let srv_root = root.join("filebrowser");
    let data_dir = root.join("filebrowser-data");
    let config_path = root.join("filebrowser-data/.filebrowser.json");

    filebrowser::EnsurePaths {
        srv_root,
        data_dir,
        config_path,
    }
}
#[tokio::test]
async fn install_fresh_pull() {
let rt = Arc::new(MockRuntime::default());

View File

@ -8,7 +8,7 @@ use crate::data_model::{Notification, NotificationLevel, PackageState};
use crate::state::StateManager;
use crate::webhooks::{self, WebhookEvent};
use serde::{Deserialize, Serialize};
use std::collections::HashMap;
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::sync::Arc;
use std::time::Instant;
@ -420,6 +420,9 @@ async fn check_containers() -> Vec<ContainerHealth> {
let stdout = String::from_utf8_lossy(&output.stdout);
let containers: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap_or_default();
let live_container_ids = live_container_ids(&containers);
cleanup_stale_podman_healthcheck_units(&live_container_ids).await;
// Monitor ALL long-running containers for health — backend services (databases,
// nbxplorer, mempool-api) and UI containers need auto-restart too.
// Only skip ephemeral containers (build infrastructure, init one-shots).
@ -462,6 +465,154 @@ async fn check_containers() -> Vec<ContainerHealth> {
.collect()
}
/// Collects the container ids found in a list of container JSON objects.
///
/// For each entry the `"Id"` key is read, falling back to `"ID"` only when
/// `"Id"` is absent. Entries whose selected value is missing or not a JSON
/// string are skipped.
fn live_container_ids(containers: &[serde_json::Value]) -> HashSet<String> {
    let mut ids = HashSet::new();
    for entry in containers {
        // The fallback applies only when "Id" is absent, not when it is present
        // with a non-string value.
        let raw = match entry.get("Id") {
            Some(value) => Some(value),
            None => entry.get("ID"),
        };
        if let Some(id) = raw.and_then(|value| value.as_str()) {
            ids.insert(id.to_string());
        }
    }
    ids
}
/// Stops Podman healthcheck systemd user units whose container id is not in
/// `live_container_ids`.
///
/// Stale units are discovered through `stale_healthcheck_units_from_systemd`.
/// The listing can contain both `<name>.timer` and `<name>.service` for the
/// same healthcheck, so names are first collapsed to their common base. For
/// every base the timer is stopped, the service is stopped, and the service's
/// failed state is reset. All steps are best-effort: failures are only visible
/// through the debug logging of the helpers.
///
/// Does nothing when `live_container_ids` is empty.
async fn cleanup_stale_podman_healthcheck_units(live_container_ids: &HashSet<String>) {
    // With no known live ids every healthcheck unit would look stale, so bail
    // out instead of stopping units for containers that may still exist.
    if live_container_ids.is_empty() {
        return;
    }

    let units = stale_healthcheck_units_from_systemd(live_container_ids).await;

    // Reduce `<base>.timer` / `<base>.service` to `<base>` so each pair is
    // handled exactly once and both unit names are derived from the base.
    // (Appending ".service" to a name that already ends in ".service" would
    // target a unit that does not exist.)
    let mut base_names: Vec<&str> = units
        .iter()
        .filter_map(|unit| {
            unit.strip_suffix(".timer")
                .or_else(|| unit.strip_suffix(".service"))
        })
        .collect();
    base_names.sort_unstable();
    base_names.dedup();

    let mut cleaned = 0;
    for base in base_names {
        let timer = format!("{base}.timer");
        let service = format!("{base}.service");
        let Some(container_id) = parse_podman_healthcheck_unit(&timer) else {
            continue;
        };

        // Timer first so it cannot re-trigger the service while it is stopped.
        let timer_stopped = stop_user_unit(&timer).await;
        let service_stopped = stop_user_unit(&service).await;
        let service_reset = reset_failed_user_unit(&service).await;
        if timer_stopped || service_stopped || service_reset {
            cleaned += 1;
        }

        debug!(
            "Stopped stale Podman healthcheck unit {} for removed container {}",
            base, container_id
        );
    }

    if cleaned > 0 {
        info!("Cleaned {} stale Podman healthcheck timer(s)", cleaned);
    }
}
/// Queries the systemd user manager for Podman healthcheck units that belong
/// to containers not present in `live_container_ids`.
///
/// Runs `systemctl --user list-timers` and `systemctl --user list-units`
/// (both with `--all --no-legend --no-pager`), each limited to 20 seconds, and
/// feeds their stdout to `stale_healthcheck_units`. A command that fails, cannot
/// be started, or times out is logged at debug level and skipped. The returned
/// list may contain duplicates because both listings are concatenated.
async fn stale_healthcheck_units_from_systemd(live_container_ids: &HashSet<String>) -> Vec<String> {
    let mut units = Vec::new();
    for args in [
        ["--user", "list-timers", "--all", "--no-legend", "--no-pager"].as_slice(),
        ["--user", "list-units", "--all", "--no-legend", "--no-pager"].as_slice(),
    ] {
        let output = match tokio::time::timeout(
            std::time::Duration::from_secs(20),
            tokio::process::Command::new("systemctl")
                .args(args.iter().copied())
                // On timeout the future is dropped; make sure the child is
                // killed with it instead of lingering in the background.
                .kill_on_drop(true)
                .output(),
        )
        .await
        {
            Ok(Ok(output)) if output.status.success() => output,
            Ok(Ok(output)) => {
                let stderr = String::from_utf8_lossy(&output.stderr);
                debug!("systemctl {} failed: {}", args.join(" "), stderr.trim());
                continue;
            }
            Ok(Err(e)) => {
                debug!("Failed to run systemctl {}: {}", args.join(" "), e);
                continue;
            }
            Err(_) => {
                debug!("systemctl {} timed out", args.join(" "));
                continue;
            }
        };

        let stdout = String::from_utf8_lossy(&output.stdout);
        units.extend(stale_healthcheck_units(&stdout, live_container_ids));
    }
    units
}
/// Extracts stale Podman healthcheck unit names from `systemctl` listing text.
///
/// Every whitespace-separated token of `output` is considered. Leading `●`
/// characters are stripped, and a token is kept when it parses as a Podman
/// healthcheck unit whose container id is not in `live_container_ids`.
/// Results keep the order of appearance and are not de-duplicated.
fn stale_healthcheck_units(output: &str, live_container_ids: &HashSet<String>) -> Vec<String> {
    let mut stale = Vec::new();
    // Line breaks are whitespace too, so one pass over all tokens visits the
    // same tokens, in the same order, as a line-by-line split would.
    for token in output.split_whitespace() {
        let unit = token.trim_start_matches('●');
        let Some(id) = parse_podman_healthcheck_unit(unit) else {
            continue;
        };
        if !live_container_ids.contains(id) {
            stale.push(unit.to_string());
        }
    }
    stale
}
/// Extracts the container id from a Podman healthcheck systemd unit name.
///
/// Accepts names of the form `<id>-<anything>.timer` or
/// `<id>-<anything>.service`, where `<id>` is everything before the first `-`
/// and must be exactly 64 ASCII hex digits. Returns the id slice borrowed from
/// `unit`, or `None` for any other name.
fn parse_podman_healthcheck_unit(unit: &str) -> Option<&str> {
    let stem = match unit.strip_suffix(".timer") {
        Some(stem) => stem,
        None => unit.strip_suffix(".service")?,
    };

    // The id is the part before the first dash; a name without a dash is
    // rejected.
    let dash = stem.find('-')?;
    let candidate = &stem[..dash];

    let is_full_hex_id =
        candidate.len() == 64 && candidate.bytes().all(|byte| byte.is_ascii_hexdigit());
    is_full_hex_id.then_some(candidate)
}
/// Runs `systemctl --user stop <unit>` through `run_systemctl_user`.
///
/// Returns `true` only when the command ran and exited successfully.
async fn stop_user_unit(unit: &str) -> bool {
run_systemctl_user(["stop", unit]).await
}
/// Runs `systemctl --user reset-failed <unit>` through `run_systemctl_user`.
///
/// Returns `true` only when the command ran and exited successfully.
async fn reset_failed_user_unit(unit: &str) -> bool {
run_systemctl_user(["reset-failed", unit]).await
}
/// Runs `systemctl --user <args...>` with a 10 second time limit.
///
/// Returns `true` when the command started, finished in time and exited
/// successfully. Spawn errors, timeouts and non-zero exits are logged at debug
/// level and reported as `false`; nothing is propagated to the caller.
async fn run_systemctl_user<const N: usize>(args: [&str; N]) -> bool {
    let outcome = tokio::time::timeout(
        std::time::Duration::from_secs(10),
        tokio::process::Command::new("systemctl")
            .arg("--user")
            .args(args.iter().copied())
            // On timeout the future is dropped; make sure the child is killed
            // with it instead of lingering in the background.
            .kill_on_drop(true)
            .output(),
    )
    .await;

    let output = match outcome {
        Ok(Ok(output)) => output,
        Ok(Err(e)) => {
            debug!("Failed to run systemctl --user {}: {}", args.join(" "), e);
            return false;
        }
        Err(_) => {
            debug!("systemctl --user {} timed out", args.join(" "));
            return false;
        }
    };

    if output.status.success() {
        return true;
    }

    let stderr = String::from_utf8_lossy(&output.stderr);
    debug!(
        "systemctl --user {} failed: {}",
        args.join(" "),
        stderr.trim()
    );
    false
}
fn parse_podman_health(c: &serde_json::Value, state: &str) -> Option<String> {
c.get("Status")
.and_then(|v| v.as_str())
@ -1173,4 +1324,47 @@ mod tests {
Some("unhealthy")
);
}
/// Timer and service unit names yield the container id; names without a
/// 64-hex-digit id prefix are rejected.
#[test]
fn parses_podman_healthcheck_systemd_units() {
    let id = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c";

    let accepted = [
        format!("{}-15c66ddfefa8a763.timer", id),
        format!("{}-15c66ddfefa8a763.service", id),
    ];
    for unit in &accepted {
        assert_eq!(parse_podman_healthcheck_unit(unit), Some(id));
    }

    let rejected = [
        "grafana.service",
        "nothexzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz-x.timer",
    ];
    for unit in rejected {
        assert_eq!(parse_podman_healthcheck_unit(unit), None);
    }
}
/// Only units whose container id is absent from the live set are reported;
/// units of live containers and unrelated units are ignored.
#[test]
fn stale_healthcheck_units_filters_only_removed_container_ids() {
    let live = "6467e25fd87d791a63fe9dbf6e2fabc7bf26533aa2c402b1089effeacf7ebbba";
    let stale = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c";
    let live_ids: HashSet<String> = std::iter::once(live.to_string()).collect();

    let output = format!(
        " {live}-6fdc497fd3ba3b62.timer loaded active waiting\n\
         {stale}-15c66ddfefa8a763.service loaded failed failed\n\
         grafana.service loaded active running\n\
         {stale}-1898d85de0bb707f.timer loaded active waiting\n"
    );

    let expected = vec![
        format!("{stale}-15c66ddfefa8a763.service"),
        format!("{stale}-1898d85de0bb707f.timer"),
    ];

    // Sort before comparing so the check does not depend on listing order.
    let mut found = stale_healthcheck_units(&output, &live_ids);
    found.sort();
    assert_eq!(found, expected);
}
}

View File

@ -1100,12 +1100,15 @@ if [ "$BACKEND_CAPTURED" = "0" ]; then
FROM rust:1.93-trixie as builder
WORKDIR /build
COPY core ./core
COPY scripts ./scripts
COPY image-recipe/configs ./image-recipe/configs
RUN cd core && cargo build --release --bin archipelago
BACKENDFILE
if $CONTAINER_CMD build --platform $CONTAINER_PLATFORM -t archipelago-backend -f "$BACKEND_DOCKERFILE" "$SCRIPT_DIR/.." 2>&1 | tail -20; then
BACKEND_IMAGE="localhost/archipelago-backend:iso"
if $CONTAINER_CMD build --platform $CONTAINER_PLATFORM -t "$BACKEND_IMAGE" -f "$BACKEND_DOCKERFILE" "$SCRIPT_DIR/.."; then
echo " Extracting backend binary..."
BACKEND_CONTAINER=$($CONTAINER_CMD create --platform $CONTAINER_PLATFORM archipelago-backend)
BACKEND_CONTAINER=$($CONTAINER_CMD create --platform $CONTAINER_PLATFORM "$BACKEND_IMAGE")
$CONTAINER_CMD cp "$BACKEND_CONTAINER:/build/core/target/release/archipelago" "$ARCH_DIR/bin/" && \
echo " ✅ Backend binary built ($(du -h "$ARCH_DIR/bin/archipelago" | cut -f1))"
$CONTAINER_CMD rm "$BACKEND_CONTAINER"

View File

@ -140,6 +140,7 @@ import { rpcClient } from '@/api/rpc-client'
import AppHeroSection from './appDetails/AppHeroSection.vue'
import AppContentSection from './appDetails/AppContentSection.vue'
import AppSidebar from './appDetails/AppSidebar.vue'
import { resolveAppUrl } from './appSession/appSessionConfig'
import {
WEB_ONLY_APP_URLS,
PACKAGE_ALIASES,
@ -266,7 +267,9 @@ const backButtonText = computed(() => {
const canLaunch = computed(() => {
if (!pkg.value) return false
if (isWebOnly.value) return true
const hasUI = !!(pkg.value.manifest.interfaces?.main?.ui || pkg.value.installed?.['interface-addresses']?.main)
const hasRuntimeAddress = !!pkg.value.installed?.['interface-addresses']?.main?.['lan-address']
const hasKnownLaunchUrl = typeof window !== 'undefined' && !!resolveAppUrl(pkg.value.manifest.id)
const hasUI = !!(pkg.value.manifest.interfaces?.main?.ui || hasRuntimeAddress || hasKnownLaunchUrl)
return hasUI && pkg.value.state === 'running' && pkg.value.health !== 'starting' && pkg.value.health !== 'unhealthy'
})

View File

@ -3,6 +3,7 @@
import type { Ref } from 'vue'
import { computed } from 'vue'
import { PackageState, type PackageDataEntry } from '@/types/api'
import { resolveAppUrl } from '../appSession/appSessionConfig'
// Service container name patterns (backend/infra, not user-facing)
export const SERVICE_NAMES = new Set([
@ -144,7 +145,9 @@ export function resolveAppIcon(id: string, pkg: PackageDataEntry, curatedIcon?:
export function canLaunch(pkg: PackageDataEntry): boolean {
if (isWebOnlyApp(pkg.manifest.id)) return true
const hasUI = pkg.manifest.interfaces?.main?.ui || pkg.installed?.['interface-addresses']?.main
const hasRuntimeAddress = !!pkg.installed?.['interface-addresses']?.main?.['lan-address']
const hasKnownLaunchUrl = typeof window !== 'undefined' && !!resolveAppUrl(pkg.manifest.id)
const hasUI = pkg.manifest.interfaces?.main?.ui || hasRuntimeAddress || hasKnownLaunchUrl
return !!hasUI && pkg.state === 'running' && pkg.health !== 'starting' && pkg.health !== 'unhealthy'
}

View File

@ -173,9 +173,9 @@ load_spec_bitcoin-knots() {
SPEC_DATA_UID="100101:100101"
# Dynamic: prune on small disk
if [ "${DISK_GB:-0}" -lt 1000 ]; then
SPEC_CUSTOM_ARGS="-server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=512"
SPEC_CUSTOM_ARGS="-server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=2048 -par=0 -maxconnections=125"
else
SPEC_CUSTOM_ARGS="-server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096"
SPEC_CUSTOM_ARGS="-server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096 -par=0 -maxconnections=125"
fi
}

View File

@ -557,11 +557,11 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -qE 'bitcoin-knots|arch
[ -z "$DISK_GB" ] && DISK_GB=$(df --output=size -BG / 2>/dev/null | tail -1 | tr -dc '0-9')
if [ "${DISK_GB:-0}" -lt 1000 ]; then
BTC_EXTRA_ARGS="-prune=550"
BTC_DBCACHE=512
BTC_DBCACHE=2048
log " Small disk (${DISK_GB}GB) — enabling pruning"
else
BTC_EXTRA_ARGS="-txindex=1"
BTC_DBCACHE=2048
BTC_DBCACHE=4096
log " Large disk (${DISK_GB}GB) — enabling txindex"
fi
if $DOCKER run -d --name bitcoin-knots --restart unless-stopped \
@ -574,7 +574,7 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -qE 'bitcoin-knots|arch
-v /var/lib/archipelago/bitcoin:/home/bitcoin/.bitcoin \
"${BITCOIN_KNOTS_IMAGE}" \
$BTC_EXTRA_ARGS \
-printtoconsole=1 -dbcache=$BTC_DBCACHE 2>>"$LOG"; then
-printtoconsole=1 -dbcache=$BTC_DBCACHE -par=0 -maxconnections=125 2>>"$LOG"; then
log "Bitcoin Knots started"
else
log "Bitcoin Knots failed (may already exist)"