archy/scripts/container-doctor.sh
Dorian d6441082fd fix: use rootless podman to check conmon ownership in doctor
Critical bug: the doctor runs as root but containers are rootless
under the archipelago user. When checking if a conmon process has an
associated container, the root podman database was queried (empty),
causing ALL conmon processes to be identified as orphaned and killed.
This terminated running containers every 30 minutes.

Fix: use sudo -u archipelago to query the rootless podman database.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
2026-03-30 23:22:28 +01:00

427 lines
14 KiB
Bash
Executable File

#!/bin/bash
#
# Container Doctor — diagnose and fix common container health issues
#
# Usage:
# sudo ./scripts/container-doctor.sh # Run locally on node
# ./scripts/container-doctor.sh user@host # Run remotely via SSH
#
# Fixes:
# 1. Stale podman ps/stats processes (>10 = pileup)
# 2. Orphaned conmon/crun processes holding ports
# 3. Container Tor (archy-tor) conflicting with system Tor — container is removed, system Tor is started
# 4. Tor hidden service directory permissions (must be 700)
# 5. SearXNG read-only root / cap-drop ALL
# 6. Bitcoin Knots prune+txindex conflict
# 7. Containers stuck with exit code 127 (binary not found)
# 8. Stopped core containers (rootless restart policy workaround)
#
# Safe to run multiple times (idempotent). Never blocks deploy (exit 0 always).
#
# A pipeline's status is that of its last failing stage, not just its last stage.
set -o pipefail

# Source pinned image versions (single source of truth).
# The file is optional: it is only read when it sits next to this script.
SCRIPT_DIR="$(cd "$(dirname "$0")" && pwd)"
if [ -f "$SCRIPT_DIR/image-versions.sh" ]; then
  . "$SCRIPT_DIR/image-versions.sh"
fi

# Tallies updated by run_fix and printed in the final summary.
FIX_NAMES=()
FIXES_APPLIED=0
CHECKS_PASSED=0
# Print a timestamped, "DOCTOR:"-prefixed message on stdout.
# Arguments: $* - message words, joined into a single line.
log() {
  printf '[%s] DOCTOR: %s\n' "$(date +%H:%M:%S)" "$*"
}
# Run one check/fix command and tally its outcome.
# Arguments: $1  - short label used in the final summary
#            $2… - command (and arguments) to execute
# Globals:   FIXES_APPLIED, FIX_NAMES - updated when the command returns 0
#            CHECKS_PASSED            - incremented when it returns non-zero
# Returns:   always 0
run_fix() {
  local label="$1"
  shift
  if ! "$@"; then
    # Non-zero from a fix function means "nothing needed fixing".
    CHECKS_PASSED=$((CHECKS_PASSED + 1))
    return 0
  fi
  FIXES_APPLIED=$((FIXES_APPLIED + 1))
  FIX_NAMES+=("$label")
}
# ── Fix 1: Stale podman processes ────────────────────────────
# Kill piled-up `podman ps` / `podman stats` processes.
# A pileup is more than 10 processes whose command line matches the pattern.
# Returns: 0 when a pileup was found and pkill was issued, 1 otherwise.
fix_stale_podman() {
  local pattern="podman (ps|stats)"
  local before remaining

  before=$(pgrep -f "$pattern" 2>/dev/null | wc -l)
  before=${before:-0}
  if ! [ "$before" -gt 10 ]; then
    return 1
  fi

  log "Killing $before stale podman ps/stats processes"
  pkill -f "$pattern" 2>/dev/null || true
  # Give the signalled processes a moment to exit before recounting.
  sleep 2
  remaining=$(pgrep -f "$pattern" 2>/dev/null | wc -l)
  remaining=${remaining:-0}
  log "Reduced from $before to $remaining"
  return 0
}
# ── Fix 2: Orphaned conmon holding ports ─────────────────────
# Kill conmon processes whose container no longer exists in any podman store.
#
# The doctor runs as root, but containers may live in the rootless store of the
# "archipelago" user, so that store is queried via sudo. Other fixes in this
# script create containers with plain (root) `podman`, so root's store is
# checked as well.
#
# Fail-safe rules (a wrong "orphan" verdict kills a running container):
#   * if rootless podman cannot be queried at all, nothing is killed;
#   * a conmon is killed only when BOTH stores answer "not found" (exit 1);
#     any other status (0 = exists, 125 = podman error, ...) spares it.
#
# Returns: 0 when at least one conmon was signalled, 1 otherwise.
fix_orphaned_conmon() {
  local killed_any=false
  local pids
  pids=$(pgrep -f "conmon.*--exit-command" 2>/dev/null || true)
  if [ -z "$pids" ]; then
    return 1
  fi

  # Derive the runtime dir from the real UID; fall back to 1000 if id fails.
  local rootless_uid
  rootless_uid=$(id -u archipelago 2>/dev/null) || rootless_uid=""
  rootless_uid=${rootless_uid:-1000}
  local -a rootless_podman=(sudo -u archipelago "XDG_RUNTIME_DIR=/run/user/${rootless_uid}" podman)

  # Preflight: sudo/podman failures also exit non-zero, so a failing query
  # must not be mistaken for "container missing".
  if ! "${rootless_podman[@]}" ps -q >/dev/null 2>&1; then
    log "Cannot query rootless podman as archipelago — skipping orphaned conmon check"
    return 1
  fi

  local pid cid rc_rootless rc_root port_info
  for pid in $pids; do
    # The process may have exited since pgrep ran.
    [ -r "/proc/$pid/cmdline" ] || continue
    # Extract the 64-hex container ID that follows conmon's "-c" argument.
    cid=$(tr '\0' ' ' 2>/dev/null < "/proc/$pid/cmdline" \
      | grep -oP '(?<=-c )[a-f0-9]{64}' | head -1 || true)
    if [ -z "$cid" ]; then
      continue
    fi

    "${rootless_podman[@]}" container exists "$cid" >/dev/null 2>&1
    rc_rootless=$?
    podman container exists "$cid" >/dev/null 2>&1
    rc_root=$?
    if [ "$rc_rootless" -ne 1 ] || [ "$rc_root" -ne 1 ]; then
      continue
    fi

    # The trailing comma keeps pid=12 from matching pid=123 in ss output.
    port_info=$(ss -tlnp 2>/dev/null | grep "pid=$pid," | grep -oP ':\K\d+' | head -3 | tr '\n' ',' | sed 's/,$//')
    log "Killing orphaned conmon pid=$pid (ports: ${port_info:-none})"
    kill "$pid" 2>/dev/null || kill -9 "$pid" 2>/dev/null || true
    killed_any=true
  done

  if $killed_any; then
    return 0
  fi
  return 1
}
# ── Fix 3: Ensure system Tor is running (preferred over container) ──
# Prefer system Tor over the containerised one.
#   1. If an "archy-tor" container is listed by `podman ps -a`, stop and
#      remove it.
#   2. If a `tor` command is available and tor@default is not active, enable
#      and start the tor / tor@default units.
# Returns: 0 when the container was removed or system Tor was brought up,
#          1 when nothing changed (including a failed start attempt).
# NOTE(review): this queries the invoking user's (root's) podman store, while
# fix_orphaned_conmon / fix_stopped_core_containers query the rootless
# archipelago store — confirm which store archy-tor lives in.
fix_system_tor_conflict() {
  local changed=false

  # Capture the listing first: with pipefail, `podman ... | grep -q` can
  # report failure when grep exits before podman has finished writing.
  local names
  names=$(podman ps -a --format '{{.Names}}' 2>/dev/null || true)
  if grep -qE '^archy-tor$' <<<"$names"; then
    podman stop archy-tor 2>/dev/null || true
    podman rm -f archy-tor 2>/dev/null || true
    log "Removed archy-tor container (system Tor is preferred)"
    changed=true
  fi

  # Ensure system Tor is enabled and running
  if command -v tor >/dev/null 2>&1; then
    if ! systemctl is-active tor@default >/dev/null 2>&1; then
      systemctl enable tor tor@default 2>/dev/null || true
      systemctl start tor tor@default 2>/dev/null || true
      # Judge success by the unit state, not by systemctl's exit status,
      # which is non-zero if either of the two named units fails to start.
      if systemctl is-active tor@default >/dev/null 2>&1; then
        log "Started system Tor"
        changed=true
      else
        log "System Tor (tor@default) failed to start"
      fi
    fi
  fi

  if $changed; then
    return 0
  fi
  return 1
}
# ── Fix 4: Tor hidden service permissions ────────────────────
# Force mode 700 on every hidden_service_* directory directly under the known
# Tor data directories, then restart tor@default if anything was changed.
# Returns: 0 when at least one directory was chmod-ed, 1 otherwise.
fix_tor_permissions() {
  local changed=false
  local base dir mode

  for base in "/var/lib/archipelago/tor" "/var/lib/tor"; do
    [ -d "$base" ] || continue
    while IFS= read -r dir; do
      mode=$(stat -c '%a' "$dir" 2>/dev/null)
      if [ "$mode" = "700" ]; then
        continue
      fi
      chmod 700 "$dir"
      log "Fixed permissions on $dir ($mode -> 700)"
      changed=true
    done < <(find "$base" -maxdepth 1 -name "hidden_service_*" -type d 2>/dev/null)
  done

  if ! $changed; then
    return 1
  fi
  # If we fixed permissions, restart system Tor to pick up the changes
  systemctl restart tor@default 2>/dev/null || true
  return 0
}
# ── Fix 5: SearXNG read-only / cap-drop ─────────────────────
# Recreate the "searxng" container when it is exited, has a read-only root
# filesystem, or drops ALL capabilities.
#
# The image is resolved BEFORE the old container is removed: SEARXNG_IMAGE
# (from image-versions.sh) when set, otherwise the image of the existing
# container. image-versions.sh is only sourced when it sits next to this
# script, so SEARXNG_IMAGE can be empty; without this guard the container
# would be deleted and `podman run ""` would fail.
#
# Returns: 0 when the container was removed for recreation,
#          1 when no fix was needed or no image could be determined
#            (the container is then left untouched).
fix_searxng() {
  # Capture first: with pipefail, `podman | grep -q` may misreport when grep
  # exits early.
  local names
  names=$(podman ps -a --format '{{.Names}}' 2>/dev/null || true)
  if ! grep -q '^searxng$' <<<"$names"; then
    return 1
  fi

  local state readonly_root cap_drop
  state=$(podman inspect searxng --format '{{.State.Status}}' 2>/dev/null || true)
  readonly_root=$(podman inspect searxng --format '{{.HostConfig.ReadonlyRootfs}}' 2>/dev/null || true)
  cap_drop=$(podman inspect searxng --format '{{.HostConfig.CapDrop}}' 2>/dev/null || true)

  # Fix if: exited, or has read-only root, or has cap-drop ALL
  local needs_fix=false
  if [ "$state" = "exited" ]; then
    needs_fix=true
  fi
  if [ "$readonly_root" = "true" ]; then
    needs_fix=true
  fi
  if [[ "$cap_drop" == *"ALL"* ]] || [[ "$cap_drop" == *"all"* ]]; then
    needs_fix=true
  fi
  if ! $needs_fix; then
    return 1
  fi

  local image="${SEARXNG_IMAGE:-}"
  if [ -z "$image" ]; then
    image=$(podman inspect searxng --format '{{.ImageName}}' 2>/dev/null || true)
  fi
  if [ -z "$image" ]; then
    log "SearXNG needs recreation but no image is known (SEARXNG_IMAGE unset) — leaving container untouched"
    return 1
  fi

  log "Recreating SearXNG (readonly=$readonly_root, cap_drop=$cap_drop, state=$state)"

  # Get current port mapping; each output line is "<container port>=<host port>".
  local port
  port=$(podman inspect searxng --format '{{range $k,$v := .HostConfig.PortBindings}}{{$k}}={{range $v}}{{.HostPort}}{{end}}{{println}}{{end}}' 2>/dev/null | head -1)
  local host_port="${port##*=}"
  if ! [[ "$host_port" =~ ^[0-9]+$ ]]; then
    host_port=8888
  fi

  podman stop searxng 2>/dev/null || true
  podman rm -f searxng 2>/dev/null || true

  # Kill any stale process still holding the port. This is sampled AFTER the
  # removal so only a genuinely left-over holder is targeted.
  local conmon_pid
  conmon_pid=$(ss -tlnp 2>/dev/null | grep ":${host_port} " | grep -oP 'pid=\K\d+' | head -1)
  if [ -n "$conmon_pid" ]; then
    kill -9 "$conmon_pid" 2>/dev/null || true
    sleep 2
  fi

  if podman run -d \
    --name searxng \
    --restart=unless-stopped \
    --security-opt=no-new-privileges:true \
    --tmpfs /tmp:rw,noexec,nosuid,size=256m \
    -v searxng-config:/etc/searxng:rw \
    -v searxng-cache:/var/cache/searxng:rw \
    -p "${host_port}:8080" \
    --memory=512m \
    "$image" 2>&1; then
    log "SearXNG recreated (no readonly, no cap-drop ALL)"
  else
    log "SearXNG recreation FAILED (image: $image) — container was removed and must be redeployed"
  fi
  return 0
}
# ── Fix 6: Bitcoin Knots prune+txindex conflict ──────────────
# Resolve a prune + txindex conflict on the "bitcoin-knots" container by
# recreating it from its image without the extra command arguments.
#
# A conflict is: bitcoin.conf enables pruning (prune=<non-zero>) AND the
# container's command contains "txindex". prune=0 is not treated as pruning.
#
# Environment (optional; defaults are the original hard-coded paths):
#   ARCHIPELAGO_BITCOIN_DIR  - data dir (default /var/lib/archipelago/bitcoin)
#   ARCHIPELAGO_SECRETS_DIR  - secrets dir (default /var/lib/archipelago/secrets)
#
# Returns: 0 when the container was removed for recreation,
#          1 when there is no conflict or the image could not be determined
#            (nothing is modified in that case).
fix_bitcoin_txindex() {
  # Capture first: with pipefail, `podman | grep -q` may misreport when grep
  # exits early.
  local names
  names=$(podman ps -a --format '{{.Names}}' 2>/dev/null || true)
  if ! grep -q '^bitcoin-knots$' <<<"$names"; then
    return 1
  fi

  local data_dir="${ARCHIPELAGO_BITCOIN_DIR:-/var/lib/archipelago/bitcoin}"
  local secrets_dir="${ARCHIPELAGO_SECRETS_DIR:-/var/lib/archipelago/secrets}"
  local conf="$data_dir/bitcoin.conf"
  local txindex_dir="$data_dir/indexes/txindex"

  # Check if bitcoin.conf has prune enabled (a value of 0 does not count)
  if [ ! -f "$conf" ] || ! grep -qE '^prune=0*[1-9]' "$conf"; then
    return 1
  fi

  # Check if container args include txindex
  local cmd
  cmd=$(podman inspect bitcoin-knots --format '{{json .Config.Cmd}}' 2>/dev/null || true)
  if ! grep -q "txindex" <<<"$cmd"; then
    return 1
  fi
  log "Bitcoin Knots: prune+txindex conflict detected"

  # Get current config; refuse to touch anything if the image is unknown,
  # otherwise the container would be removed with no way to recreate it.
  local image network
  image=$(podman inspect bitcoin-knots --format '{{.ImageName}}' 2>/dev/null || true)
  network=$(podman inspect bitcoin-knots --format '{{.HostConfig.NetworkMode}}' 2>/dev/null || true)
  if [ -z "$image" ]; then
    log "Bitcoin Knots: cannot determine container image — leaving container untouched"
    return 1
  fi

  # Read per-installation RPC password (falls back to the built-in default)
  local rpc_pass="archipelago"
  if [ -f "$secrets_dir/bitcoin-rpc-password" ]; then
    rpc_pass=$(cat "$secrets_dir/bitcoin-rpc-password")
  fi

  # Ensure bitcoin.conf has all RPC settings.
  # NOTE(review): this replaces the whole file (including the configured
  # prune value) whenever no "rpcuser=" line is present — confirm intended.
  if ! grep -q 'rpcuser=' "$conf"; then
    cat > "$conf" <<BCONF
server=1
prune=550
rpcuser=archipelago
rpcpassword=$rpc_pass
rpcbind=0.0.0.0
rpcallowip=127.0.0.1/32
rpcallowip=10.88.0.0/16
rpcport=8332
listen=1
printtoconsole=1
BCONF
    log "Updated bitcoin.conf with full RPC settings"
  fi

  # Stop and remove the container BEFORE touching the index, so files are not
  # deleted from under a running node.
  podman stop bitcoin-knots 2>/dev/null || true
  podman rm -f bitcoin-knots 2>/dev/null || true
  sleep 2

  # Remove stale txindex if present
  if [ -d "$txindex_dir" ]; then
    find "$txindex_dir" -type f -delete 2>/dev/null
    rmdir "$txindex_dir" 2>/dev/null || true
    log "Removed stale txindex directory"
  fi

  # Kill stale conmon on port 8332/8333
  local p cpid
  for p in 8332 8333; do
    cpid=$(ss -tlnp 2>/dev/null | grep ":${p} " | grep -oP 'pid=\K\d+' | head -1)
    if [ -n "$cpid" ]; then
      kill -9 "$cpid" 2>/dev/null || true
    fi
  done
  sleep 1

  # Keep a custom or host network; an empty or "bridge" mode maps to archy-net.
  local net_name
  case "$network" in
    "" | bridge) net_name="archy-net" ;;
    *) net_name="$network" ;;
  esac

  # NOTE(review): the RPC password is embedded in the health command and is
  # therefore part of the container's stored configuration.
  if podman run -d \
    --name bitcoin-knots \
    --restart=always \
    "--network=$net_name" \
    -p 8332:8332 \
    -p 8333:8333 \
    -v "$data_dir:/home/bitcoin/.bitcoin" \
    --memory=2g \
    --cap-drop=ALL \
    --cap-add=CHOWN \
    --cap-add=FOWNER \
    --cap-add=SETUID \
    --cap-add=SETGID \
    --cap-add=DAC_OVERRIDE \
    --security-opt=no-new-privileges:true \
    --health-cmd="bitcoin-cli -rpcuser=archipelago -rpcpassword=$rpc_pass getblockchaininfo || exit 1" \
    --health-interval=30s \
    --health-retries=3 \
    "$image" 2>&1; then
    log "Bitcoin Knots recreated without txindex (prune mode)"
  else
    log "Bitcoin Knots recreation FAILED (image: $image) — container was removed and must be redeployed"
  fi
  return 0
}
# ── Fix 7: Exit code 127 containers ─────────────────────────
# Recreate containers whose status is "Exited (127)".
#
# Each such container (except searxng, which has its own fix) is removed and,
# when podman recorded its CreateCommand, re-run with the same arguments minus
# the first two elements (expected to be "podman" and "run").
#
# The arguments are decoded from JSON into a bash array and passed to podman
# verbatim. They are never re-parsed by the shell, so values containing
# spaces, quotes, `;`, `$`, `*` or backticks are preserved exactly.
#
# Requires: python3 for JSON decoding; when it is unavailable or decoding
#           fails the container is only removed.
# Returns:  0 when at least one container was handled, 1 otherwise.
fix_exit_127() {
  local containers
  containers=$(podman ps -a --format '{{.Names}} {{.Status}}' 2>/dev/null | grep 'Exited (127)' | awk '{print $1}' || true)
  if [ -z "$containers" ]; then
    return 1
  fi

  local -a handled=()
  local -a recreate_args
  local name create_cmd arg
  for name in $containers; do
    # Skip containers handled by other fixes
    if [ "$name" = "searxng" ]; then
      continue
    fi
    log "Container $name has exit code 127 — recreating"

    # The create command must be read before the container is removed.
    create_cmd=$(podman inspect "$name" --format '{{json .Config.CreateCommand}}' 2>/dev/null || true)
    podman rm -f "$name" 2>/dev/null || true

    recreate_args=()
    if [ -n "$create_cmd" ] && [ "$create_cmd" != "null" ]; then
      # Emit every argument after 'podman run', NUL-terminated, so each one
      # becomes exactly one array element.
      while IFS= read -r -d '' arg; do
        recreate_args+=("$arg")
      done < <(printf '%s' "$create_cmd" \
        | python3 -c 'import json, sys; sys.stdout.write("".join(a + "\0" for a in json.load(sys.stdin)[2:]))' 2>/dev/null)
    fi

    if [ ${#recreate_args[@]} -gt 0 ]; then
      podman run "${recreate_args[@]}" 2>&1 || true
      handled+=("$name")
      log "Recreated $name from original args"
    else
      handled+=("$name(removed)")
      log "Removed $name — will be recreated on next deploy"
    fi
  done

  if [ ${#handled[@]} -gt 0 ]; then
    return 0
  fi
  return 1
}
# ── Fix 8: Restart stopped core containers ──────────────────
# Rootless Podman 4.x restart policies don't auto-restart on crash.
# This check restarts any exited core containers (tiers 0-2).
# Start every core container whose state is "exited" or "stopped".
#
# The doctor runs as root but containers are rootless under the archipelago
# user, so podman is invoked through sudo with that user's runtime directory.
# The directory is derived from the user's real UID (falling back to 1000 when
# `id` fails) instead of assuming UID 1000.
#
# Returns: 0 when at least one container was started successfully,
#          1 otherwise.
fix_stopped_core_containers() {
  local -a core_containers=(
    bitcoin-knots lnd electrumx mempool-api archy-mempool-web
    archy-mempool-db archy-btcpay-db archy-nbxplorer btcpay-server
  )
  local -a restarted=()

  local rootless_uid
  rootless_uid=$(id -u archipelago 2>/dev/null) || rootless_uid=""
  rootless_uid=${rootless_uid:-1000}
  local -a rootless_podman=(sudo -u archipelago "XDG_RUNTIME_DIR=/run/user/${rootless_uid}" podman)

  local name state
  for name in "${core_containers[@]}"; do
    # A failing inspect is reported as "missing" and the container is skipped.
    state=$("${rootless_podman[@]}" inspect "$name" --format '{{.State.Status}}' 2>/dev/null || echo "missing")
    if [ "$state" = "exited" ] || [ "$state" = "stopped" ]; then
      log "Restarting stopped container: $name"
      if "${rootless_podman[@]}" start "$name" 2>/dev/null; then
        restarted+=("$name")
      else
        log "Failed to restart container: $name"
      fi
    fi
  done

  if [ ${#restarted[@]} -gt 0 ]; then
    return 0
  fi
  return 1
}
# ── Main ─────────────────────────────────────────────────────
# If remote host provided, run via SSH
# Remote mode: any first argument other than "--local" is taken as an SSH
# target. The script (and image-versions.sh, when present next to it) is
# copied to /tmp on the target and executed there under sudo with --local.
# Always exits 0 so a failing doctor never blocks a deploy.
# NOTE(review): /tmp/container-doctor.sh is a fixed, predictable path that is
# then executed as root on the target — consider a private temp directory.
if [ -n "$1" ] && [ "$1" != "--local" ]; then
  REMOTE_HOST="$1"
  SSH_KEY="${ARCHIPELAGO_SSH_KEY:-$HOME/.ssh/archipelago-deploy}"
  # An array keeps a key path containing spaces as a single argument.
  SSH_OPTS=(-o StrictHostKeyChecking=no -o ServerAliveInterval=15 -o ServerAliveCountMax=4 -i "$SSH_KEY")
  log "Running container doctor on $REMOTE_HOST"

  # Copy script to remote; without a fresh copy, do not run whatever stale
  # file may already sit at that path.
  if ! scp "${SSH_OPTS[@]}" "$0" "$REMOTE_HOST:/tmp/container-doctor.sh" 2>/dev/null; then
    log "Failed to copy doctor script to $REMOTE_HOST — skipping remote run"
    exit 0
  fi
  # The remote copy looks for image-versions.sh next to itself, so ship it
  # alongside; otherwise the pinned image variables are unset remotely.
  if [ -f "$SCRIPT_DIR/image-versions.sh" ]; then
    scp "${SSH_OPTS[@]}" "$SCRIPT_DIR/image-versions.sh" "$REMOTE_HOST:/tmp/image-versions.sh" 2>/dev/null \
      || log "Could not copy image-versions.sh to $REMOTE_HOST — continuing without pinned image versions"
  fi
  ssh "${SSH_OPTS[@]}" "$REMOTE_HOST" "sudo bash /tmp/container-doctor.sh --local" 2>&1
  exit 0
fi

# Running locally (on the node itself): execute every fix in order; run_fix
# tallies which ones reported that they changed something.
log "Starting container health check"
run_fix "stale-podman" fix_stale_podman
run_fix "orphaned-conmon" fix_orphaned_conmon
run_fix "system-tor" fix_system_tor_conflict
run_fix "tor-permissions" fix_tor_permissions
run_fix "searxng" fix_searxng
run_fix "bitcoin-txindex" fix_bitcoin_txindex
run_fix "exit-127" fix_exit_127
run_fix "stopped-core" fix_stopped_core_containers

echo ""
if [ "$FIXES_APPLIED" -gt 0 ]; then
  log "Done: $FIXES_APPLIED fixes applied (${FIX_NAMES[*]}), $CHECKS_PASSED checks passed"
else
  log "Done: all $CHECKS_PASSED checks passed — no fixes needed"
fi
exit 0