fix: harden container reconcile and launch behavior
This commit is contained in:
parent
835c525218
commit
2ff47f88a7
@ -7,6 +7,14 @@
|
||||
# Allow demo assets (AIUI pre-built dist)
|
||||
!demo/
|
||||
|
||||
# Allow backend source for ISO source builds
|
||||
!core/
|
||||
!scripts/
|
||||
!image-recipe/
|
||||
image-recipe/build/
|
||||
image-recipe/results/
|
||||
image-recipe/output/
|
||||
|
||||
# Exclude nested node_modules (will npm install in container)
|
||||
neode-ui/node_modules
|
||||
neode-ui/dist
|
||||
|
||||
@ -30,7 +30,7 @@ app:
|
||||
RPC_PASS="$(printenv BITCOIN_RPC_PASS)";
|
||||
DISK_GB_VALUE="$(printenv DISK_GB || true)";
|
||||
if [ "${DISK_GB_VALUE:-0}" -lt 1000 ]; then
|
||||
exec "$BITCOIND" -datadir=/home/bitcoin/.bitcoin -noconf -server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=1024 -par=0 -maxconnections=125 -rpcuser="$RPC_USER" -rpcpassword="$RPC_PASS";
|
||||
exec "$BITCOIND" -datadir=/home/bitcoin/.bitcoin -noconf -server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=2048 -par=0 -maxconnections=125 -rpcuser="$RPC_USER" -rpcpassword="$RPC_PASS";
|
||||
else
|
||||
exec "$BITCOIND" -datadir=/home/bitcoin/.bitcoin -noconf -server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096 -par=0 -maxconnections=125 -rpcuser="$RPC_USER" -rpcpassword="$RPC_PASS";
|
||||
fi
|
||||
|
||||
@ -62,6 +62,19 @@ fn is_required_baseline_app(app_id: &str) -> bool {
|
||||
)
|
||||
}
|
||||
|
||||
fn is_restart_sensitive_app(app_id: &str) -> bool {
|
||||
matches!(
|
||||
app_id,
|
||||
"bitcoin-knots"
|
||||
| "bitcoin-core"
|
||||
| "bitcoin"
|
||||
| "lnd"
|
||||
| "btcpay-server"
|
||||
| "fedimint"
|
||||
| "fedimint-gateway"
|
||||
)
|
||||
}
|
||||
|
||||
fn requires_archival_bitcoin(app_id: &str) -> bool {
|
||||
matches!(
|
||||
app_id,
|
||||
@ -713,6 +726,17 @@ impl ProdContainerOrchestrator {
|
||||
return Ok(ReconcileAction::Started);
|
||||
}
|
||||
if self.container_env_drifted(&name, &resolved_manifest).await {
|
||||
if mode == ReconcileMode::ExistingOnly
|
||||
&& is_restart_sensitive_app(&app_id)
|
||||
{
|
||||
tracing::info!(
|
||||
app_id = %app_id,
|
||||
container = %name,
|
||||
"container drift detected during boot reconcile; leaving running restart-sensitive app untouched"
|
||||
);
|
||||
self.run_post_start_hooks(&app_id).await?;
|
||||
return Ok(ReconcileAction::NoOp);
|
||||
}
|
||||
tracing::info!(app_id = %app_id, container = %name, "container env drift detected — recreating");
|
||||
let _ = self.runtime.stop_container(&name).await;
|
||||
let _ = self.runtime.remove_container(&name).await;
|
||||
@ -2252,6 +2276,7 @@ mod tests {
|
||||
runtime,
|
||||
PathBuf::from("/nonexistent-for-tests"),
|
||||
);
|
||||
orch.set_data_dir(PathBuf::from("/nonexistent-for-tests"));
|
||||
// Redirect the bitcoin-ui pre-start hook to a test-scoped
|
||||
// tmpdir, seeded with a fake password file. Shared across
|
||||
// every test in this module (OnceLock), so the hook can run
|
||||
@ -2259,6 +2284,7 @@ mod tests {
|
||||
// this redirection, any test that installs the bitcoin-ui
|
||||
// fixture would try to write under /var/lib/archipelago.
|
||||
orch.set_bitcoin_ui_paths(test_bitcoin_ui_paths());
|
||||
orch.set_filebrowser_paths(test_filebrowser_paths());
|
||||
orch
|
||||
}
|
||||
|
||||
@ -2339,6 +2365,17 @@ app:
|
||||
}
|
||||
}
|
||||
|
||||
fn test_filebrowser_paths() -> filebrowser::EnsurePaths {
|
||||
use std::sync::OnceLock;
|
||||
static DIR: OnceLock<tempfile::TempDir> = OnceLock::new();
|
||||
let dir = DIR.get_or_init(|| tempfile::TempDir::new().expect("test tmpdir"));
|
||||
filebrowser::EnsurePaths {
|
||||
srv_root: dir.path().join("filebrowser"),
|
||||
data_dir: dir.path().join("filebrowser-data"),
|
||||
config_path: dir.path().join("filebrowser-data/.filebrowser.json"),
|
||||
}
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn install_fresh_pull() {
|
||||
let rt = Arc::new(MockRuntime::default());
|
||||
|
||||
@ -8,7 +8,7 @@ use crate::data_model::{Notification, NotificationLevel, PackageState};
|
||||
use crate::state::StateManager;
|
||||
use crate::webhooks::{self, WebhookEvent};
|
||||
use serde::{Deserialize, Serialize};
|
||||
use std::collections::HashMap;
|
||||
use std::collections::{HashMap, HashSet};
|
||||
use std::path::{Path, PathBuf};
|
||||
use std::sync::Arc;
|
||||
use std::time::Instant;
|
||||
@ -420,6 +420,9 @@ async fn check_containers() -> Vec<ContainerHealth> {
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
let containers: Vec<serde_json::Value> = serde_json::from_str(&stdout).unwrap_or_default();
|
||||
|
||||
let live_container_ids = live_container_ids(&containers);
|
||||
cleanup_stale_podman_healthcheck_units(&live_container_ids).await;
|
||||
|
||||
// Monitor ALL long-running containers for health — backend services (databases,
|
||||
// nbxplorer, mempool-api) and UI containers need auto-restart too.
|
||||
// Only skip ephemeral containers (build infrastructure, init one-shots).
|
||||
@ -462,6 +465,154 @@ async fn check_containers() -> Vec<ContainerHealth> {
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn live_container_ids(containers: &[serde_json::Value]) -> HashSet<String> {
|
||||
containers
|
||||
.iter()
|
||||
.filter_map(|c| {
|
||||
c.get("Id")
|
||||
.or_else(|| c.get("ID"))
|
||||
.and_then(|v| v.as_str())
|
||||
.map(|s| s.to_string())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
async fn cleanup_stale_podman_healthcheck_units(live_container_ids: &HashSet<String>) {
|
||||
if live_container_ids.is_empty() {
|
||||
return;
|
||||
}
|
||||
|
||||
let mut units = stale_healthcheck_units_from_systemd(live_container_ids).await;
|
||||
if units.is_empty() {
|
||||
return;
|
||||
}
|
||||
units.sort();
|
||||
units.dedup();
|
||||
|
||||
let mut cleaned = 0;
|
||||
for unit in units {
|
||||
let Some(container_id) = parse_podman_healthcheck_unit(&unit) else {
|
||||
continue;
|
||||
};
|
||||
let service = format!("{}.service", unit.trim_end_matches(".timer"));
|
||||
if stop_user_unit(&unit).await {
|
||||
cleaned += 1;
|
||||
}
|
||||
let _ = stop_user_unit(&service).await;
|
||||
let _ = reset_failed_user_unit(&service).await;
|
||||
debug!(
|
||||
"Stopped stale Podman healthcheck unit {} for removed container {}",
|
||||
unit, container_id
|
||||
);
|
||||
}
|
||||
|
||||
if cleaned > 0 {
|
||||
info!("Cleaned {} stale Podman healthcheck timer(s)", cleaned);
|
||||
}
|
||||
}
|
||||
|
||||
async fn stale_healthcheck_units_from_systemd(live_container_ids: &HashSet<String>) -> Vec<String> {
|
||||
let mut units = Vec::new();
|
||||
for args in [
|
||||
["--user", "list-timers", "--all", "--no-legend", "--no-pager"].as_slice(),
|
||||
["--user", "list-units", "--all", "--no-legend", "--no-pager"].as_slice(),
|
||||
] {
|
||||
let output = match tokio::time::timeout(
|
||||
std::time::Duration::from_secs(20),
|
||||
tokio::process::Command::new("systemctl")
|
||||
.args(args.iter().copied())
|
||||
.output(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(Ok(output)) if output.status.success() => output,
|
||||
Ok(Ok(output)) => {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
debug!("systemctl {} failed: {}", args.join(" "), stderr.trim());
|
||||
continue;
|
||||
}
|
||||
Ok(Err(e)) => {
|
||||
debug!("Failed to run systemctl {}: {}", args.join(" "), e);
|
||||
continue;
|
||||
}
|
||||
Err(_) => {
|
||||
debug!("systemctl {} timed out", args.join(" "));
|
||||
continue;
|
||||
}
|
||||
};
|
||||
|
||||
let stdout = String::from_utf8_lossy(&output.stdout);
|
||||
units.extend(stale_healthcheck_units(&stdout, live_container_ids));
|
||||
}
|
||||
units
|
||||
}
|
||||
|
||||
fn stale_healthcheck_units(output: &str, live_container_ids: &HashSet<String>) -> Vec<String> {
|
||||
output
|
||||
.lines()
|
||||
.flat_map(|line| line.split_whitespace())
|
||||
.filter_map(|token| {
|
||||
let unit = token.trim_start_matches('●');
|
||||
let id = parse_podman_healthcheck_unit(unit)?;
|
||||
(!live_container_ids.contains(id)).then(|| unit.to_string())
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
|
||||
fn parse_podman_healthcheck_unit(unit: &str) -> Option<&str> {
|
||||
let unit = unit
|
||||
.strip_suffix(".timer")
|
||||
.or_else(|| unit.strip_suffix(".service"))?;
|
||||
let (container_id, _suffix) = unit.split_once('-')?;
|
||||
if container_id.len() == 64 && container_id.bytes().all(|b| b.is_ascii_hexdigit()) {
|
||||
Some(container_id)
|
||||
} else {
|
||||
None
|
||||
}
|
||||
}
|
||||
|
||||
async fn stop_user_unit(unit: &str) -> bool {
|
||||
run_systemctl_user(["stop", unit]).await
|
||||
}
|
||||
|
||||
async fn reset_failed_user_unit(unit: &str) -> bool {
|
||||
run_systemctl_user(["reset-failed", unit]).await
|
||||
}
|
||||
|
||||
async fn run_systemctl_user<const N: usize>(args: [&str; N]) -> bool {
|
||||
let output = match tokio::time::timeout(
|
||||
std::time::Duration::from_secs(10),
|
||||
tokio::process::Command::new("systemctl")
|
||||
.arg("--user")
|
||||
.args(args.iter().copied())
|
||||
.output(),
|
||||
)
|
||||
.await
|
||||
{
|
||||
Ok(Ok(output)) => output,
|
||||
Ok(Err(e)) => {
|
||||
debug!("Failed to run systemctl --user {}: {}", args.join(" "), e);
|
||||
return false;
|
||||
}
|
||||
Err(_) => {
|
||||
debug!("systemctl --user {} timed out", args.join(" "));
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
if output.status.success() {
|
||||
true
|
||||
} else {
|
||||
let stderr = String::from_utf8_lossy(&output.stderr);
|
||||
debug!(
|
||||
"systemctl --user {} failed: {}",
|
||||
args.join(" "),
|
||||
stderr.trim()
|
||||
);
|
||||
false
|
||||
}
|
||||
}
|
||||
|
||||
fn parse_podman_health(c: &serde_json::Value, state: &str) -> Option<String> {
|
||||
c.get("Status")
|
||||
.and_then(|v| v.as_str())
|
||||
@ -1173,4 +1324,47 @@ mod tests {
|
||||
Some("unhealthy")
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn parses_podman_healthcheck_systemd_units() {
|
||||
let id = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c";
|
||||
assert_eq!(
|
||||
parse_podman_healthcheck_unit(&format!("{}-15c66ddfefa8a763.timer", id)),
|
||||
Some(id)
|
||||
);
|
||||
assert_eq!(
|
||||
parse_podman_healthcheck_unit(&format!("{}-15c66ddfefa8a763.service", id)),
|
||||
Some(id)
|
||||
);
|
||||
assert_eq!(parse_podman_healthcheck_unit("grafana.service"), None);
|
||||
assert_eq!(
|
||||
parse_podman_healthcheck_unit("nothexzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz-x.timer"),
|
||||
None
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn stale_healthcheck_units_filters_only_removed_container_ids() {
|
||||
let live = "6467e25fd87d791a63fe9dbf6e2fabc7bf26533aa2c402b1089effeacf7ebbba";
|
||||
let stale = "c1f44a6369c91d65f9e9f6134a5591aa02792cff2f1a4e0f689b5a6c03b6c77c";
|
||||
let mut live_ids = HashSet::new();
|
||||
live_ids.insert(live.to_string());
|
||||
|
||||
let output = format!(
|
||||
" {live}-6fdc497fd3ba3b62.timer loaded active waiting\n\
|
||||
● {stale}-15c66ddfefa8a763.service loaded failed failed\n\
|
||||
grafana.service loaded active running\n\
|
||||
{stale}-1898d85de0bb707f.timer loaded active waiting\n"
|
||||
);
|
||||
|
||||
let mut units = stale_healthcheck_units(&output, &live_ids);
|
||||
units.sort();
|
||||
assert_eq!(
|
||||
units,
|
||||
vec![
|
||||
format!("{stale}-15c66ddfefa8a763.service"),
|
||||
format!("{stale}-1898d85de0bb707f.timer"),
|
||||
]
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
@ -1100,12 +1100,15 @@ if [ "$BACKEND_CAPTURED" = "0" ]; then
|
||||
FROM rust:1.93-trixie as builder
|
||||
WORKDIR /build
|
||||
COPY core ./core
|
||||
COPY scripts ./scripts
|
||||
COPY image-recipe/configs ./image-recipe/configs
|
||||
RUN cd core && cargo build --release --bin archipelago
|
||||
BACKENDFILE
|
||||
|
||||
if $CONTAINER_CMD build --platform $CONTAINER_PLATFORM -t archipelago-backend -f "$BACKEND_DOCKERFILE" "$SCRIPT_DIR/.." 2>&1 | tail -20; then
|
||||
BACKEND_IMAGE="localhost/archipelago-backend:iso"
|
||||
if $CONTAINER_CMD build --platform $CONTAINER_PLATFORM -t "$BACKEND_IMAGE" -f "$BACKEND_DOCKERFILE" "$SCRIPT_DIR/.."; then
|
||||
echo " Extracting backend binary..."
|
||||
BACKEND_CONTAINER=$($CONTAINER_CMD create --platform $CONTAINER_PLATFORM archipelago-backend)
|
||||
BACKEND_CONTAINER=$($CONTAINER_CMD create --platform $CONTAINER_PLATFORM "$BACKEND_IMAGE")
|
||||
$CONTAINER_CMD cp "$BACKEND_CONTAINER:/build/core/target/release/archipelago" "$ARCH_DIR/bin/" && \
|
||||
echo " ✅ Backend binary built ($(du -h "$ARCH_DIR/bin/archipelago" | cut -f1))"
|
||||
$CONTAINER_CMD rm "$BACKEND_CONTAINER"
|
||||
|
||||
@ -140,6 +140,7 @@ import { rpcClient } from '@/api/rpc-client'
|
||||
import AppHeroSection from './appDetails/AppHeroSection.vue'
|
||||
import AppContentSection from './appDetails/AppContentSection.vue'
|
||||
import AppSidebar from './appDetails/AppSidebar.vue'
|
||||
import { resolveAppUrl } from './appSession/appSessionConfig'
|
||||
import {
|
||||
WEB_ONLY_APP_URLS,
|
||||
PACKAGE_ALIASES,
|
||||
@ -266,7 +267,9 @@ const backButtonText = computed(() => {
|
||||
const canLaunch = computed(() => {
|
||||
if (!pkg.value) return false
|
||||
if (isWebOnly.value) return true
|
||||
const hasUI = !!(pkg.value.manifest.interfaces?.main?.ui || pkg.value.installed?.['interface-addresses']?.main)
|
||||
const hasRuntimeAddress = !!pkg.value.installed?.['interface-addresses']?.main?.['lan-address']
|
||||
const hasKnownLaunchUrl = typeof window !== 'undefined' && !!resolveAppUrl(pkg.value.manifest.id)
|
||||
const hasUI = !!(pkg.value.manifest.interfaces?.main?.ui || hasRuntimeAddress || hasKnownLaunchUrl)
|
||||
return hasUI && pkg.value.state === 'running' && pkg.value.health !== 'starting' && pkg.value.health !== 'unhealthy'
|
||||
})
|
||||
|
||||
|
||||
@ -3,6 +3,7 @@
|
||||
import type { Ref } from 'vue'
|
||||
import { computed } from 'vue'
|
||||
import { PackageState, type PackageDataEntry } from '@/types/api'
|
||||
import { resolveAppUrl } from '../appSession/appSessionConfig'
|
||||
|
||||
// Service container name patterns (backend/infra, not user-facing)
|
||||
export const SERVICE_NAMES = new Set([
|
||||
@ -144,7 +145,9 @@ export function resolveAppIcon(id: string, pkg: PackageDataEntry, curatedIcon?:
|
||||
|
||||
export function canLaunch(pkg: PackageDataEntry): boolean {
|
||||
if (isWebOnlyApp(pkg.manifest.id)) return true
|
||||
const hasUI = pkg.manifest.interfaces?.main?.ui || pkg.installed?.['interface-addresses']?.main
|
||||
const hasRuntimeAddress = !!pkg.installed?.['interface-addresses']?.main?.['lan-address']
|
||||
const hasKnownLaunchUrl = typeof window !== 'undefined' && !!resolveAppUrl(pkg.manifest.id)
|
||||
const hasUI = pkg.manifest.interfaces?.main?.ui || hasRuntimeAddress || hasKnownLaunchUrl
|
||||
return !!hasUI && pkg.state === 'running' && pkg.health !== 'starting' && pkg.health !== 'unhealthy'
|
||||
}
|
||||
|
||||
|
||||
@ -173,9 +173,9 @@ load_spec_bitcoin-knots() {
|
||||
SPEC_DATA_UID="100101:100101"
|
||||
# Dynamic: prune on small disk
|
||||
if [ "${DISK_GB:-0}" -lt 1000 ]; then
|
||||
SPEC_CUSTOM_ARGS="-server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=512"
|
||||
SPEC_CUSTOM_ARGS="-server=1 -prune=550 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=2048 -par=0 -maxconnections=125"
|
||||
else
|
||||
SPEC_CUSTOM_ARGS="-server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096"
|
||||
SPEC_CUSTOM_ARGS="-server=1 -txindex=1 -rpcallowip=0.0.0.0/0 -rpcbind=0.0.0.0:8332 -listen=1 -bind=0.0.0.0:8333 -dbcache=4096 -par=0 -maxconnections=125"
|
||||
fi
|
||||
}
|
||||
|
||||
|
||||
@ -557,11 +557,11 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -qE 'bitcoin-knots|arch
|
||||
[ -z "$DISK_GB" ] && DISK_GB=$(df --output=size -BG / 2>/dev/null | tail -1 | tr -dc '0-9')
|
||||
if [ "${DISK_GB:-0}" -lt 1000 ]; then
|
||||
BTC_EXTRA_ARGS="-prune=550"
|
||||
BTC_DBCACHE=512
|
||||
BTC_DBCACHE=2048
|
||||
log " Small disk (${DISK_GB}GB) — enabling pruning"
|
||||
else
|
||||
BTC_EXTRA_ARGS="-txindex=1"
|
||||
BTC_DBCACHE=2048
|
||||
BTC_DBCACHE=4096
|
||||
log " Large disk (${DISK_GB}GB) — enabling txindex"
|
||||
fi
|
||||
if $DOCKER run -d --name bitcoin-knots --restart unless-stopped \
|
||||
@ -574,7 +574,7 @@ if ! $DOCKER ps --format '{{.Names}}' 2>/dev/null | grep -qE 'bitcoin-knots|arch
|
||||
-v /var/lib/archipelago/bitcoin:/home/bitcoin/.bitcoin \
|
||||
"${BITCOIN_KNOTS_IMAGE}" \
|
||||
$BTC_EXTRA_ARGS \
|
||||
-printtoconsole=1 -dbcache=$BTC_DBCACHE 2>>"$LOG"; then
|
||||
-printtoconsole=1 -dbcache=$BTC_DBCACHE -par=0 -maxconnections=125 2>>"$LOG"; then
|
||||
log "Bitcoin Knots started"
|
||||
else
|
||||
log "Bitcoin Knots failed (may already exist)"
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user