fix: container orchestration overhaul — names, errors, Tor, restart

Container name resolution:
- New all_container_names() — single source of truth for every app's
  container name variants (bitcoin-knots/bitcoin/bitcoin-core, etc.)
- Covers all historical naming patterns and multi-container stacks

Start/Stop/Restart:
- No more silent failures (let _ = podman...). Every operation logs
  the command, checks exit status, and returns real errors to the UI.
- Restart uses stop+start fallback when podman restart fails
  (handles rootless podman loopback adapter errors)
- "No containers found" error when app doesn't exist

Tor helper:
- Install archipelago-tor-helper.path + .service in rootfs
- Enable the path unit so backend can manage Tor as non-root
- Copy tor-helper.sh to /opt/archipelago/scripts/

Verified: container with proper caps can stop/start/restart cleanly.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
This commit is contained in:
Dorian 2026-03-29 19:26:21 +01:00
parent fbabbd0722
commit 610e51500b
3 changed files with 152 additions and 72 deletions

View File

@ -281,6 +281,66 @@ pub(super) fn get_memory_limit(app_id: &str) -> &'static str {
}
/// Get all container names for an app (handles multi-container apps like mempool)
/// All known container name variants for a given app ID.
/// This is the single source of truth for container name resolution.
/// Every name that could appear in `podman ps` for this app must be listed here.
pub(super) fn all_container_names(package_id: &str) -> Vec<String> {
let base = package_id.to_string();
let archy = format!("archy-{}", package_id);
match package_id {
// Bitcoin: multiple historical names
"bitcoin" | "bitcoin-core" | "bitcoin-knots" => vec![
"bitcoin-knots".into(), "bitcoin".into(), "bitcoin-core".into(),
"archy-bitcoin-knots".into(), "archy-bitcoin".into(),
"bitcoin-ui".into(),
],
// LND + UI
"lnd" => vec!["lnd".into(), "archy-lnd".into(), "archy-lnd-ui".into()],
// Electrumx: multiple aliases
"electrumx" | "electrs" | "mempool-electrs" => vec![
"electrumx".into(), "electrs".into(), "mempool-electrs".into(),
"archy-electrumx".into(), "archy-electrs-ui".into(),
],
// Mempool: multi-container stack
"mempool" | "mempool-web" => vec![
"mempool".into(), "mempool-web".into(), "mempool-api".into(),
"archy-mempool-web".into(), "archy-mempool-api".into(),
"archy-mempool-db".into(), "mysql-mempool".into(),
],
// BTCPay: multi-container + multiple aliases
"btcpay-server" | "btcpayserver" | "btcpay" => vec![
"btcpay-server".into(), "btcpay".into(), "btcpayserver".into(),
"archy-btcpay".into(), "archy-btcpay-db".into(), "archy-nbxplorer".into(),
],
// Home Assistant: two naming conventions
"homeassistant" | "home-assistant" => vec![
"homeassistant".into(), "home-assistant".into(),
"archy-homeassistant".into(),
],
// Fedimint: multiple related containers
"fedimint" => vec![
"fedimint".into(), "fedimintd".into(),
"fedimint-ui".into(), "archy-fedimint".into(),
"fedimint-gateway".into(),
],
"fedimint-gateway" => vec!["fedimint-gateway".into()],
// Immich: multi-container
"immich" => vec![
"immich_postgres".into(), "immich_redis".into(), "immich_server".into(),
],
// Penpot: multi-container
"penpot" | "penpot-frontend" => vec![
"penpot-postgres".into(), "penpot-valkey".into(),
"penpot-backend".into(), "penpot-exporter".into(), "penpot-frontend".into(),
],
// Default: exact name + archy- prefix
_ => vec![base, archy],
}
}
/// Find all running/stopped containers that belong to a given app.
/// Uses the canonical name list from all_container_names().
pub(super) async fn get_containers_for_app(package_id: &str) -> Result<Vec<String>> {
validate_app_id(package_id)?;
let output = tokio::process::Command::new("podman")
@ -291,48 +351,11 @@ pub(super) async fn get_containers_for_app(package_id: &str) -> Result<Vec<Strin
let stdout = String::from_utf8_lossy(&output.stdout);
let all: Vec<&str> = stdout.lines().filter(|s| !s.is_empty()).collect();
let patterns: Vec<String> = match package_id {
"mempool" | "mempool-web" => {
vec![
"electrumx".into(),
"mempool-electrs".into(),
"mempool-api".into(),
"archy-mempool-api".into(),
"archy-mempool-web".into(),
"mempool".into(),
"archy-mempool-db".into(),
"mysql-mempool".into(),
]
}
"fedimint" => vec![
"fedimint".into(),
"fedimint-ui".into(),
"archy-fedimint".into(),
"fedimint-gateway".into(),
],
"fedimint-gateway" => vec!["fedimint-gateway".into()],
"immich" => vec![
"immich_postgres".into(),
"immich_redis".into(),
"immich_server".into(),
],
"penpot" | "penpot-frontend" => vec![
"penpot-postgres".into(),
"penpot-valkey".into(),
"penpot-backend".into(),
"penpot-exporter".into(),
"penpot-frontend".into(),
],
_ => vec![package_id.to_string(), format!("archy-{}", package_id)],
};
let patterns = all_container_names(package_id);
let mut result = Vec::new();
for name in all {
for pat in &patterns {
if name == pat {
result.push(name.to_string());
break;
}
if patterns.iter().any(|p| p == name) {
result.push(name.to_string());
}
}
Ok(result)

View File

@ -34,6 +34,10 @@ impl RpcHandler {
validate_app_id(package_id)?;
let to_start = ordered_containers_for_start(package_id).await?;
if to_start.is_empty() {
tracing::warn!("package.start {}: no containers found", package_id);
return Err(anyhow::anyhow!("No containers found for {}", package_id));
}
// Clear user-stopped flag — user explicitly started this app
crate::crash_recovery::clear_user_stopped(&self.config.data_dir, package_id).await;
@ -41,13 +45,24 @@ impl RpcHandler {
crate::crash_recovery::clear_user_stopped(&self.config.data_dir, name).await;
}
for name in to_start {
let _ = tokio::process::Command::new("podman")
.args(["start", &name])
let mut errors = Vec::new();
for name in &to_start {
tracing::info!("Starting container: {}", name);
let out = tokio::process::Command::new("podman")
.args(["start", name])
.output()
.await;
.await
.context(format!("Failed to exec podman start {}", name))?;
if !out.status.success() {
let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string();
tracing::error!("Failed to start {}: {}", name, stderr);
errors.push(format!("{}: {}", name, stderr));
}
}
if !errors.is_empty() {
return Err(anyhow::anyhow!("Start failed: {}", errors.join("; ")));
}
Ok(serde_json::Value::Null)
}
@ -63,31 +78,36 @@ impl RpcHandler {
.ok_or_else(|| anyhow::anyhow!("Missing package id"))?;
validate_app_id(package_id)?;
// Mark as user-stopped so health monitor and crash recovery don't auto-restart
crate::crash_recovery::mark_user_stopped(&self.config.data_dir, package_id).await;
let containers = get_containers_for_app(package_id).await?;
if containers.is_empty() {
let container_name = format!("archy-{}", package_id);
crate::crash_recovery::mark_user_stopped(&self.config.data_dir, &container_name)
.await;
let _ = tokio::process::Command::new("podman")
.args(["stop", "-t", stop_timeout_secs(&container_name), &container_name])
.output()
.await;
return Ok(serde_json::Value::Null);
tracing::warn!("package.stop {}: no containers found", package_id);
return Err(anyhow::anyhow!("No containers found for {}", package_id));
}
// Mark as user-stopped so health monitor and crash recovery don't auto-restart
crate::crash_recovery::mark_user_stopped(&self.config.data_dir, package_id).await;
for name in &containers {
crate::crash_recovery::mark_user_stopped(&self.config.data_dir, name).await;
}
for name in containers {
let _ = tokio::process::Command::new("podman")
.args(["stop", "-t", stop_timeout_secs(&name), &name])
let mut errors = Vec::new();
for name in &containers {
tracing::info!("Stopping container: {} (timeout: {}s)", name, stop_timeout_secs(name));
let out = tokio::process::Command::new("podman")
.args(["stop", "-t", stop_timeout_secs(name), name])
.output()
.await;
.await
.context(format!("Failed to exec podman stop {}", name))?;
if !out.status.success() {
let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string();
tracing::error!("Failed to stop {}: {}", name, stderr);
errors.push(format!("{}: {}", name, stderr));
}
}
if !errors.is_empty() {
return Err(anyhow::anyhow!("Stop failed: {}", errors.join("; ")));
}
Ok(serde_json::Value::Null)
}
@ -105,21 +125,47 @@ impl RpcHandler {
let containers = get_containers_for_app(package_id).await?;
if containers.is_empty() {
let container_name = format!("archy-{}", package_id);
let _ = tokio::process::Command::new("podman")
.args(["restart", &container_name])
.output()
.await;
return Ok(serde_json::Value::Null);
tracing::warn!("package.restart {}: no containers found", package_id);
return Err(anyhow::anyhow!("No containers found for {}", package_id));
}
for name in containers {
let _ = tokio::process::Command::new("podman")
.args(["restart", &name])
let mut errors = Vec::new();
for name in &containers {
tracing::info!("Restarting container: {}", name);
let out = tokio::process::Command::new("podman")
.args(["restart", "-t", stop_timeout_secs(name), name])
.output()
.await;
.await
.context(format!("Failed to exec podman restart {}", name))?;
if !out.status.success() {
let stderr = String::from_utf8_lossy(&out.stderr).trim().to_string();
tracing::warn!("podman restart {} failed: {}, trying stop+start", name, stderr);
// Fallback: stop then start (handles rootless podman loopback issues)
let _ = tokio::process::Command::new("podman")
.args(["stop", "-t", stop_timeout_secs(name), name])
.output()
.await;
let start_out = tokio::process::Command::new("podman")
.args(["start", name])
.output()
.await
.context(format!("Failed to exec podman start {}", name))?;
if !start_out.status.success() {
let start_err = String::from_utf8_lossy(&start_out.stderr).trim().to_string();
tracing::error!("stop+start {} also failed: {}", name, start_err);
errors.push(format!("{}: {}", name, start_err));
} else {
tracing::info!("Restarted {} via stop+start fallback", name);
}
}
}
if !errors.is_empty() {
return Err(anyhow::anyhow!("Restart failed: {}", errors.join("; ")));
}
Ok(serde_json::Value::Null)
}

View File

@ -339,12 +339,15 @@ COPY archipelago-doctor.service /etc/systemd/system/archipelago-doctor.service
COPY archipelago-doctor.timer /etc/systemd/system/archipelago-doctor.timer
COPY archipelago-reconcile.service /etc/systemd/system/archipelago-reconcile.service
COPY archipelago-reconcile.timer /etc/systemd/system/archipelago-reconcile.timer
COPY archipelago-tor-helper.service /etc/systemd/system/archipelago-tor-helper.service
COPY archipelago-tor-helper.path /etc/systemd/system/archipelago-tor-helper.path
# Copy container doctor + reconcile scripts (referenced by the services above)
RUN mkdir -p /home/archipelago/archy/scripts
COPY container-doctor.sh /home/archipelago/archy/scripts/container-doctor.sh
COPY reconcile-containers.sh /home/archipelago/archy/scripts/reconcile-containers.sh
RUN chmod +x /home/archipelago/archy/scripts/*.sh && \
COPY tor-helper.sh /opt/archipelago/scripts/tor-helper.sh
RUN chmod +x /home/archipelago/archy/scripts/*.sh /opt/archipelago/scripts/*.sh && \
chown -R archipelago:archipelago /home/archipelago/archy
# Enable services
@ -357,7 +360,8 @@ RUN systemctl enable NetworkManager || true && \
systemctl enable chrony || true && \
systemctl enable archipelago-update.timer || true && \
systemctl enable archipelago-doctor.timer || true && \
systemctl enable archipelago-reconcile.timer || true
systemctl enable archipelago-reconcile.timer || true && \
systemctl enable archipelago-tor-helper.path || true
# Remove policy-rc.d so services can start on first boot
RUN rm -f /usr/sbin/policy-rc.d
@ -424,7 +428,7 @@ NGINXCONF
cp "$SCRIPT_DIR/configs/archipelago-reconcile.service" "$WORK_DIR/archipelago-reconcile.service"
cp "$SCRIPT_DIR/configs/archipelago-reconcile.timer" "$WORK_DIR/archipelago-reconcile.timer"
# Copy the actual scripts the services reference
for s in container-doctor.sh reconcile-containers.sh; do
for s in container-doctor.sh reconcile-containers.sh tor-helper.sh; do
if [ -f "$SCRIPT_DIR/../scripts/$s" ]; then
cp "$SCRIPT_DIR/../scripts/$s" "$WORK_DIR/$s"
fi
@ -432,6 +436,13 @@ NGINXCONF
echo " Using container doctor + reconcile timers from configs/"
fi
# Copy Tor helper path-activated service (allows backend to manage Tor as non-root)
if [ -f "$SCRIPT_DIR/configs/archipelago-tor-helper.service" ]; then
cp "$SCRIPT_DIR/configs/archipelago-tor-helper.service" "$WORK_DIR/archipelago-tor-helper.service"
cp "$SCRIPT_DIR/configs/archipelago-tor-helper.path" "$WORK_DIR/archipelago-tor-helper.path"
echo " Using tor-helper path unit from configs/"
fi
# Use archipelago.service from configs/ (User=root for Podman container access)
if [ -f "$SCRIPT_DIR/configs/archipelago.service" ]; then
cp "$SCRIPT_DIR/configs/archipelago.service" "$WORK_DIR/archipelago.service"