From 610e51500bc31a9eaff843964bc30f0e064f2bbf Mon Sep 17 00:00:00 2001 From: Dorian Date: Sun, 29 Mar 2026 19:26:21 +0100 Subject: [PATCH] =?UTF-8?q?fix:=20container=20orchestration=20overhaul=20?= =?UTF-8?q?=E2=80=94=20names,=20errors,=20Tor,=20restart?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Container name resolution: - New all_container_names() — single source of truth for every app's container name variants (bitcoin-knots/bitcoin/bitcoin-core, etc.) - Covers all historical naming patterns and multi-container stacks Start/Stop/Restart: - No more silent failures (let _ = podman...). Every operation logs the command, checks exit status, and returns real errors to the UI. - Restart uses stop+start fallback when podman restart fails (handles rootless podman loopback adapter errors) - "No containers found" error when app doesn't exist Tor helper: - Install archipelago-tor-helper.path + .service in rootfs - Enable the path unit so backend can manage Tor as non-root - Copy tor-helper.sh to /opt/archipelago/scripts/ Verified: container with proper caps can stop/start/restart cleanly. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../archipelago/src/api/rpc/package/config.rs | 103 ++++++++++------- .../src/api/rpc/package/runtime.rs | 104 +++++++++++++----- image-recipe/build-auto-installer-iso.sh | 17 ++- 3 files changed, 152 insertions(+), 72 deletions(-) diff --git a/core/archipelago/src/api/rpc/package/config.rs b/core/archipelago/src/api/rpc/package/config.rs index 17656631..05c0bdf5 100644 --- a/core/archipelago/src/api/rpc/package/config.rs +++ b/core/archipelago/src/api/rpc/package/config.rs @@ -281,6 +281,66 @@ pub(super) fn get_memory_limit(app_id: &str) -> &'static str { } /// Get all container names for an app (handles multi-container apps like mempool) +/// All known container name variants for a given app ID. +/// This is the single source of truth for container name resolution. +/// Every name that could appear in `podman ps` for this app must be listed here. +pub(super) fn all_container_names(package_id: &str) -> Vec { + let base = package_id.to_string(); + let archy = format!("archy-{}", package_id); + + match package_id { + // Bitcoin: multiple historical names + "bitcoin" | "bitcoin-core" | "bitcoin-knots" => vec![ + "bitcoin-knots".into(), "bitcoin".into(), "bitcoin-core".into(), + "archy-bitcoin-knots".into(), "archy-bitcoin".into(), + "bitcoin-ui".into(), + ], + // LND + UI + "lnd" => vec!["lnd".into(), "archy-lnd".into(), "archy-lnd-ui".into()], + // Electrumx: multiple aliases + "electrumx" | "electrs" | "mempool-electrs" => vec![ + "electrumx".into(), "electrs".into(), "mempool-electrs".into(), + "archy-electrumx".into(), "archy-electrs-ui".into(), + ], + // Mempool: multi-container stack + "mempool" | "mempool-web" => vec![ + "mempool".into(), "mempool-web".into(), "mempool-api".into(), + "archy-mempool-web".into(), "archy-mempool-api".into(), + "archy-mempool-db".into(), "mysql-mempool".into(), + ], + // BTCPay: multi-container + multiple aliases + "btcpay-server" | "btcpayserver" | "btcpay" => vec![ + "btcpay-server".into(), "btcpay".into(), "btcpayserver".into(), + "archy-btcpay".into(), "archy-btcpay-db".into(), "archy-nbxplorer".into(), + ], + // Home Assistant: two naming conventions + "homeassistant" | "home-assistant" => vec![ + "homeassistant".into(), "home-assistant".into(), + "archy-homeassistant".into(), + ], + // Fedimint: multiple related containers + "fedimint" => vec![ + "fedimint".into(), "fedimintd".into(), + "fedimint-ui".into(), "archy-fedimint".into(), + "fedimint-gateway".into(), + ], + "fedimint-gateway" => vec!["fedimint-gateway".into()], + // Immich: multi-container + "immich" => vec![ + "immich_postgres".into(), "immich_redis".into(), "immich_server".into(), + ], + // Penpot: multi-container + "penpot" | "penpot-frontend" => vec![ + "penpot-postgres".into(), "penpot-valkey".into(), + "penpot-backend".into(), "penpot-exporter".into(), "penpot-frontend".into(), + ], + // Default: exact name + archy- prefix + _ => vec![base, archy], + } +} + +/// Find all running/stopped containers that belong to a given app. +/// Uses the canonical name list from all_container_names(). pub(super) async fn get_containers_for_app(package_id: &str) -> Result> { validate_app_id(package_id)?; let output = tokio::process::Command::new("podman") @@ -291,48 +351,11 @@ pub(super) async fn get_containers_for_app(package_id: &str) -> Result = stdout.lines().filter(|s| !s.is_empty()).collect(); - let patterns: Vec = match package_id { - "mempool" | "mempool-web" => { - vec![ - "electrumx".into(), - "mempool-electrs".into(), - "mempool-api".into(), - "archy-mempool-api".into(), - "archy-mempool-web".into(), - "mempool".into(), - "archy-mempool-db".into(), - "mysql-mempool".into(), - ] - } - "fedimint" => vec![ - "fedimint".into(), - "fedimint-ui".into(), - "archy-fedimint".into(), - "fedimint-gateway".into(), - ], - "fedimint-gateway" => vec!["fedimint-gateway".into()], - "immich" => vec![ - "immich_postgres".into(), - "immich_redis".into(), - "immich_server".into(), - ], - "penpot" | "penpot-frontend" => vec![ - "penpot-postgres".into(), - "penpot-valkey".into(), - "penpot-backend".into(), - "penpot-exporter".into(), - "penpot-frontend".into(), - ], - _ => vec![package_id.to_string(), format!("archy-{}", package_id)], - }; - + let patterns = all_container_names(package_id); let mut result = Vec::new(); for name in all { - for pat in &patterns { - if name == pat { - result.push(name.to_string()); - break; - } + if patterns.iter().any(|p| p == name) { + result.push(name.to_string()); } } Ok(result) diff --git a/core/archipelago/src/api/rpc/package/runtime.rs b/core/archipelago/src/api/rpc/package/runtime.rs index c1143fe5..27773392 100644 --- a/core/archipelago/src/api/rpc/package/runtime.rs +++ b/core/archipelago/src/api/rpc/package/runtime.rs @@ -34,6 +34,10 @@ impl RpcHandler { validate_app_id(package_id)?; let to_start = ordered_containers_for_start(package_id).await?; + if to_start.is_empty() { + tracing::warn!("package.start {}: no containers found", package_id); + return Err(anyhow::anyhow!("No containers found for {}", package_id)); + } // Clear user-stopped flag — user explicitly started this app crate::crash_recovery::clear_user_stopped(&self.config.data_dir, package_id).await; @@ -41,13 +45,24 @@ impl RpcHandler { crate::crash_recovery::clear_user_stopped(&self.config.data_dir, name).await; } - for name in to_start { - let _ = tokio::process::Command::new("podman") - .args(["start", &name]) + let mut errors = Vec::new(); + for name in &to_start { + tracing::info!("Starting container: {}", name); + let out = tokio::process::Command::new("podman") + .args(["start", name]) .output() - .await; + .await + .context(format!("Failed to exec podman start {}", name))?; + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string(); + tracing::error!("Failed to start {}: {}", name, stderr); + errors.push(format!("{}: {}", name, stderr)); + } } + if !errors.is_empty() { + return Err(anyhow::anyhow!("Start failed: {}", errors.join("; "))); + } Ok(serde_json::Value::Null) } @@ -63,31 +78,36 @@ impl RpcHandler { .ok_or_else(|| anyhow::anyhow!("Missing package id"))?; validate_app_id(package_id)?; - // Mark as user-stopped so health monitor and crash recovery don't auto-restart - crate::crash_recovery::mark_user_stopped(&self.config.data_dir, package_id).await; - let containers = get_containers_for_app(package_id).await?; if containers.is_empty() { - let container_name = format!("archy-{}", package_id); - crate::crash_recovery::mark_user_stopped(&self.config.data_dir, &container_name) - .await; - let _ = tokio::process::Command::new("podman") - .args(["stop", "-t", stop_timeout_secs(&container_name), &container_name]) - .output() - .await; - return Ok(serde_json::Value::Null); + tracing::warn!("package.stop {}: no containers found", package_id); + return Err(anyhow::anyhow!("No containers found for {}", package_id)); } + // Mark as user-stopped so health monitor and crash recovery don't auto-restart + crate::crash_recovery::mark_user_stopped(&self.config.data_dir, package_id).await; for name in &containers { crate::crash_recovery::mark_user_stopped(&self.config.data_dir, name).await; } - for name in containers { - let _ = tokio::process::Command::new("podman") - .args(["stop", "-t", stop_timeout_secs(&name), &name]) + + let mut errors = Vec::new(); + for name in &containers { + tracing::info!("Stopping container: {} (timeout: {}s)", name, stop_timeout_secs(name)); + let out = tokio::process::Command::new("podman") + .args(["stop", "-t", stop_timeout_secs(name), name]) .output() - .await; + .await + .context(format!("Failed to exec podman stop {}", name))?; + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string(); + tracing::error!("Failed to stop {}: {}", name, stderr); + errors.push(format!("{}: {}", name, stderr)); + } } + if !errors.is_empty() { + return Err(anyhow::anyhow!("Stop failed: {}", errors.join("; "))); + } Ok(serde_json::Value::Null) } @@ -105,21 +125,47 @@ impl RpcHandler { let containers = get_containers_for_app(package_id).await?; if containers.is_empty() { - let container_name = format!("archy-{}", package_id); - let _ = tokio::process::Command::new("podman") - .args(["restart", &container_name]) - .output() - .await; - return Ok(serde_json::Value::Null); + tracing::warn!("package.restart {}: no containers found", package_id); + return Err(anyhow::anyhow!("No containers found for {}", package_id)); } - for name in containers { - let _ = tokio::process::Command::new("podman") - .args(["restart", &name]) + let mut errors = Vec::new(); + for name in &containers { + tracing::info!("Restarting container: {}", name); + let out = tokio::process::Command::new("podman") + .args(["restart", "-t", stop_timeout_secs(name), name]) .output() - .await; + .await + .context(format!("Failed to exec podman restart {}", name))?; + + if !out.status.success() { + let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string(); + tracing::warn!("podman restart {} failed: {}, trying stop+start", name, stderr); + + // Fallback: stop then start (handles rootless podman loopback issues) + let _ = tokio::process::Command::new("podman") + .args(["stop", "-t", stop_timeout_secs(name), name]) + .output() + .await; + let start_out = tokio::process::Command::new("podman") + .args(["start", name]) + .output() + .await + .context(format!("Failed to exec podman start {}", name))?; + + if !start_out.status.success() { + let start_err = String::from_utf8_lossy(&start_out.stderr).trim().to_string(); + tracing::error!("stop+start {} also failed: {}", name, start_err); + errors.push(format!("{}: {}", name, start_err)); + } else { + tracing::info!("Restarted {} via stop+start fallback", name); + } + } } + if !errors.is_empty() { + return Err(anyhow::anyhow!("Restart failed: {}", errors.join("; "))); + } Ok(serde_json::Value::Null) } diff --git a/image-recipe/build-auto-installer-iso.sh b/image-recipe/build-auto-installer-iso.sh index da858bd8..e97c69bf 100755 --- a/image-recipe/build-auto-installer-iso.sh +++ b/image-recipe/build-auto-installer-iso.sh @@ -339,12 +339,15 @@ COPY archipelago-doctor.service /etc/systemd/system/archipelago-doctor.service COPY archipelago-doctor.timer /etc/systemd/system/archipelago-doctor.timer COPY archipelago-reconcile.service /etc/systemd/system/archipelago-reconcile.service COPY archipelago-reconcile.timer /etc/systemd/system/archipelago-reconcile.timer +COPY archipelago-tor-helper.service /etc/systemd/system/archipelago-tor-helper.service +COPY archipelago-tor-helper.path /etc/systemd/system/archipelago-tor-helper.path # Copy container doctor + reconcile scripts (referenced by the services above) RUN mkdir -p /home/archipelago/archy/scripts COPY container-doctor.sh /home/archipelago/archy/scripts/container-doctor.sh COPY reconcile-containers.sh /home/archipelago/archy/scripts/reconcile-containers.sh -RUN chmod +x /home/archipelago/archy/scripts/*.sh && \ +COPY tor-helper.sh /opt/archipelago/scripts/tor-helper.sh +RUN chmod +x /home/archipelago/archy/scripts/*.sh /opt/archipelago/scripts/*.sh && \ chown -R archipelago:archipelago /home/archipelago/archy # Enable services @@ -357,7 +360,8 @@ RUN systemctl enable NetworkManager || true && \ systemctl enable chrony || true && \ systemctl enable archipelago-update.timer || true && \ systemctl enable archipelago-doctor.timer || true && \ - systemctl enable archipelago-reconcile.timer || true + systemctl enable archipelago-reconcile.timer || true && \ + systemctl enable archipelago-tor-helper.path || true # Remove policy-rc.d so services can start on first boot RUN rm -f /usr/sbin/policy-rc.d @@ -424,7 +428,7 @@ NGINXCONF cp "$SCRIPT_DIR/configs/archipelago-reconcile.service" "$WORK_DIR/archipelago-reconcile.service" cp "$SCRIPT_DIR/configs/archipelago-reconcile.timer" "$WORK_DIR/archipelago-reconcile.timer" # Copy the actual scripts the services reference - for s in container-doctor.sh reconcile-containers.sh; do + for s in container-doctor.sh reconcile-containers.sh tor-helper.sh; do if [ -f "$SCRIPT_DIR/../scripts/$s" ]; then cp "$SCRIPT_DIR/../scripts/$s" "$WORK_DIR/$s" fi @@ -432,6 +436,13 @@ NGINXCONF echo " Using container doctor + reconcile timers from configs/" fi + # Copy Tor helper path-activated service (allows backend to manage Tor as non-root) + if [ -f "$SCRIPT_DIR/configs/archipelago-tor-helper.service" ]; then + cp "$SCRIPT_DIR/configs/archipelago-tor-helper.service" "$WORK_DIR/archipelago-tor-helper.service" + cp "$SCRIPT_DIR/configs/archipelago-tor-helper.path" "$WORK_DIR/archipelago-tor-helper.path" + echo " Using tor-helper path unit from configs/" + fi + # Use archipelago.service from configs/ (User=root for Podman container access) if [ -f "$SCRIPT_DIR/configs/archipelago.service" ]; then cp "$SCRIPT_DIR/configs/archipelago.service" "$WORK_DIR/archipelago.service"